himipo committed on
Commit
63e9186
·
1 Parent(s): 11aa70b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. configs/base/dataloader.yml +39 -0
  2. configs/base/deim.yml +48 -0
  3. configs/base/deimv2.yml +144 -0
  4. configs/base/dfine_hgnetv2.yml +90 -0
  5. configs/base/optimizer.yml +35 -0
  6. configs/base/rt_deim.yml +49 -0
  7. configs/base/rt_optimizer.yml +37 -0
  8. configs/base/rtdetrv2_r50vd.yml +90 -0
  9. configs/{coco_detection.yml → dataset/coco_detection.yml} +0 -0
  10. configs/dataset/crowdhuman_detection.yml +41 -0
  11. configs/dataset/custom_detection.yml +41 -0
  12. configs/dataset/obj365_detection.yml +41 -0
  13. configs/dataset/voc_detection.yml +40 -0
  14. configs/deim_dfine/deim_hgnetv2_l_coco.yml +37 -0
  15. configs/deim_dfine/deim_hgnetv2_m_coco.yml +39 -0
  16. configs/deim_dfine/deim_hgnetv2_n_coco.yml +44 -0
  17. configs/deim_dfine/deim_hgnetv2_s_coco.yml +39 -0
  18. configs/deim_dfine/deim_hgnetv2_x_coco.yml +37 -0
  19. configs/deim_dfine/dfine_hgnetv2_l_coco.yml +44 -0
  20. configs/deim_dfine/dfine_hgnetv2_m_coco.yml +60 -0
  21. configs/deim_dfine/dfine_hgnetv2_n_coco.yml +82 -0
  22. configs/deim_dfine/dfine_hgnetv2_s_coco.yml +61 -0
  23. configs/deim_dfine/dfine_hgnetv2_x_coco.yml +56 -0
  24. configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml +50 -0
  25. configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml +57 -0
  26. configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml +36 -0
  27. configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml +32 -0
  28. configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml +36 -0
  29. configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml +35 -0
  30. configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml +39 -0
  31. configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml +40 -0
  32. configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml +44 -0
  33. configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml +57 -0
  34. configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml +25 -0
  35. configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml +43 -0
  36. configs/deimv2/deimv2_dinov3_l_coco.yml +104 -0
  37. configs/deimv2/deimv2_dinov3_m_coco.yml +107 -0
  38. configs/{deimv2_floorplan.yaml → deimv2/deimv2_dinov3_s_coco.yml} +31 -112
  39. configs/deimv2/deimv2_dinov3_x_coco.yml +94 -0
  40. configs/deimv2/deimv2_hgnetv2_atto_coco.yml +123 -0
  41. configs/deimv2/deimv2_hgnetv2_femto_coco.yml +128 -0
  42. configs/deimv2/deimv2_hgnetv2_l_coco.yml +24 -0
  43. configs/deimv2/deimv2_hgnetv2_m_coco.yml +72 -0
  44. configs/deimv2/deimv2_hgnetv2_n_coco.yml +96 -0
  45. configs/deimv2/deimv2_hgnetv2_pico_coco.yml +128 -0
  46. configs/deimv2/deimv2_hgnetv2_s_coco.yml +76 -0
  47. configs/deimv2/deimv2_hgnetv2_x_coco.yml +60 -0
  48. configs/runtime.yml +20 -0
  49. engine/__init__.py +13 -10
  50. engine/backbone/vit_tiny.py +15 -40
configs/base/dataloader.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ train_dataloader:
3
+ dataset:
4
+ transforms:
5
+ ops:
6
+ - {type: RandomPhotometricDistort, p: 0.5}
7
+ - {type: RandomZoomOut, fill: 0}
8
+ - {type: RandomIoUCrop, p: 0.8}
9
+ - {type: SanitizeBoundingBoxes, min_size: 1}
10
+ - {type: RandomHorizontalFlip}
11
+ - {type: Resize, size: [640, 640], }
12
+ - {type: SanitizeBoundingBoxes, min_size: 1}
13
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
14
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
15
+ policy:
16
+ name: stop_epoch
17
+ epoch: 72 # epoch in [72, ~) stop `ops`
18
+ ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] # Mosaic is excluded
19
+
20
+ collate_fn:
21
+ type: BatchImageCollateFunction
22
+ base_size: 640
23
+ base_size_repeat: 3
24
+ stop_epoch: 72 # epoch in [72, ~) stop `multiscales`
25
+
26
+ shuffle: True
27
+ total_batch_size: 32 # total batch size equals to 32 (4 * 8)
28
+ num_workers: 4
29
+
30
+
31
+ val_dataloader:
32
+ dataset:
33
+ transforms:
34
+ ops:
35
+ - {type: Resize, size: [640, 640], }
36
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
37
+ shuffle: False
38
+ total_batch_size: 64
39
+ num_workers: 4
configs/base/deim.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dense O2O
2
+ train_dataloader:
3
+ dataset:
4
+ transforms:
5
+ ops:
6
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
7
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
8
+ - {type: RandomPhotometricDistort, p: 0.5}
9
+ - {type: RandomZoomOut, fill: 0}
10
+ - {type: RandomIoUCrop, p: 0.8}
11
+ - {type: SanitizeBoundingBoxes, min_size: 1}
12
+ - {type: RandomHorizontalFlip}
13
+ - {type: Resize, size: [640, 640], }
14
+ - {type: SanitizeBoundingBoxes, min_size: 1}
15
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
16
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
17
+ policy:
18
+ epoch: [4, 29, 50] # list
19
+ ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
20
+ mosaic_prob: 0.5
21
+
22
+ collate_fn:
23
+ mixup_prob: 0.5
24
+ mixup_epochs: [4, 29]
25
+ stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
26
+
27
+ # Unfreezing BN
28
+ HGNetv2:
29
+ freeze_at: -1 # 0 default
30
+ freeze_norm: False # True default
31
+
32
+ # Activation
33
+ DFINETransformer:
34
+ activation: silu
35
+ mlp_act: silu
36
+
37
+ ## Our LR-Scheduler
38
+ lrsheduler: flatcosine
39
+ lr_gamma: 0.5
40
+ warmup_iter: 2000
41
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
42
+ no_aug_epoch: 8
43
+
44
+ ## Our Loss
45
+ DEIMCriterion:
46
+ weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
47
+ losses: ['mal', 'boxes', 'local']
48
+ gamma: 1.5
configs/base/deimv2.yml ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ model: DEIM
4
+ criterion: DEIMCriterion
5
+ postprocessor: PostProcessor
6
+
7
+ use_focal_loss: True
8
+ eval_spatial_size: [640, 640] # h w
9
+ checkpoint_freq: 5 # save freq
10
+
11
+ DEIM:
12
+ backbone: HGNetv2
13
+ encoder: HybridEncoder
14
+ decoder: DEIMTransformer
15
+
16
+ HGNetv2:
17
+ name: 'B4'
18
+ return_idx: [1, 2, 3]
19
+ freeze_at: -1 # 0 default
20
+ freeze_stem_only: True
21
+ freeze_norm: False # True default
22
+ pretrained: True
23
+ local_model_dir: ./weight/hgnetv2/
24
+
25
+ HybridEncoder:
26
+ in_channels: [512, 1024, 2048]
27
+ feat_strides: [8, 16, 32]
28
+
29
+ # intra
30
+ hidden_dim: 256
31
+ use_encoder_idx: [2]
32
+ num_encoder_layers: 1
33
+ nhead: 8
34
+ dim_feedforward: 1024
35
+ dropout: 0.
36
+ enc_act: 'gelu'
37
+
38
+ # cross
39
+ expansion: 1.0
40
+ depth_mult: 1
41
+ act: 'silu'
42
+
43
+ # New
44
+ version: deim
45
+ csp_type: csp2
46
+ fuse_op: sum
47
+
48
+ DEIMTransformer:
49
+ feat_channels: [256, 256, 256]
50
+ feat_strides: [8, 16, 32]
51
+ hidden_dim: 256
52
+ num_levels: 3
53
+
54
+ num_layers: 6
55
+ eval_idx: -1
56
+ num_queries: 300
57
+
58
+ num_denoising: 100
59
+ label_noise_ratio: 0.5
60
+ box_noise_scale: 1.0
61
+
62
+ reg_max: 32
63
+ reg_scale: 4
64
+ layer_scale: 1 # 2
65
+
66
+ num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
67
+ cross_attn_method: default # default, discrete
68
+ query_select_method: default # default, agnostic
69
+
70
+ # Act
71
+ activation: silu
72
+ mlp_act: silu
73
+
74
+ # FFN
75
+ dim_feedforward: 2048
76
+
77
+ PostProcessor:
78
+ num_top_queries: 300
79
+
80
+
81
+ ## DEIM LR-Scheduler
82
+ epoches: 58 # 50 + 8 (stop_epoch + no_aug_epoch); increase to search for the optimal EMA
83
+
84
+ lrsheduler: flatcosine
85
+ lr_gamma: 0.5
86
+ warmup_iter: 2000
87
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
88
+ no_aug_epoch: 8
89
+
90
+ ## Dense O2O: Mosaic + Mixup + CopyBlend
91
+ train_dataloader:
92
+ dataset:
93
+ transforms:
94
+ ops:
95
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
96
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
97
+ - {type: RandomPhotometricDistort, p: 0.5}
98
+ - {type: RandomZoomOut, fill: 0}
99
+ - {type: RandomIoUCrop, p: 0.8}
100
+ - {type: SanitizeBoundingBoxes, min_size: 1}
101
+ - {type: RandomHorizontalFlip}
102
+ - {type: Resize, size: [640, 640], }
103
+ - {type: SanitizeBoundingBoxes, min_size: 1}
104
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
105
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
106
+ # Mosaic options
107
+ policy:
108
+ epoch: [4, 29, 50] # list
109
+ ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
110
+ mosaic_prob: 0.5
111
+
112
+ collate_fn:
113
+ # Mixup options
114
+ mixup_prob: 0.5
115
+ mixup_epochs: [4, 29]
116
+ stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
117
+ # CopyBlend options
118
+ copyblend_prob: 0.5
119
+ copyblend_epochs: [4, 50]
120
+ area_threshold: 100
121
+ num_objects: 3
122
+ with_expand: True
123
+ expand_ratios: [0.1, 0.25]
124
+
125
+ ema_restart_decay: 0.9999
126
+ base_size_repeat: 4
127
+
128
+ ## DEIM Loss
129
+ DEIMCriterion:
130
+ weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
131
+ losses: ['mal', 'boxes', 'local']
132
+ gamma: 1.5
133
+ alpha: 0.75
134
+ reg_max: 32
135
+
136
+ matcher:
137
+ type: HungarianMatcher
138
+ weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
139
+ alpha: 0.25
140
+ gamma: 2.0
141
+ # change matcher
142
+ change_matcher: True
143
+ iou_order_alpha: 4.0
144
+ matcher_change_epoch: 45
configs/base/dfine_hgnetv2.yml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ model: DEIM
4
+ criterion: DEIMCriterion
5
+ postprocessor: PostProcessor
6
+
7
+ use_focal_loss: True
8
+ eval_spatial_size: [640, 640] # h w
9
+ checkpoint_freq: 4 # save freq
10
+
11
+ DEIM:
12
+ backbone: HGNetv2
13
+ encoder: HybridEncoder
14
+ decoder: DFINETransformer
15
+
16
+ # Add, default for step lr scheduler
17
+ lrsheduler: flatcosine
18
+ lr_gamma: 1
19
+ warmup_iter: 500
20
+ flat_epoch: 4000000
21
+ no_aug_epoch: 0
22
+
23
+ HGNetv2:
24
+ pretrained: True
25
+ local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/
26
+
27
+ HybridEncoder:
28
+ in_channels: [512, 1024, 2048]
29
+ feat_strides: [8, 16, 32]
30
+
31
+ # intra
32
+ hidden_dim: 256
33
+ use_encoder_idx: [2]
34
+ num_encoder_layers: 1
35
+ nhead: 8
36
+ dim_feedforward: 1024
37
+ dropout: 0.
38
+ enc_act: 'gelu'
39
+
40
+ # cross
41
+ expansion: 1.0
42
+ depth_mult: 1
43
+ act: 'silu'
44
+
45
+
46
+ DFINETransformer:
47
+ feat_channels: [256, 256, 256]
48
+ feat_strides: [8, 16, 32]
49
+ hidden_dim: 256
50
+ num_levels: 3
51
+
52
+ num_layers: 6
53
+ eval_idx: -1
54
+ num_queries: 300
55
+
56
+ num_denoising: 100
57
+ label_noise_ratio: 0.5
58
+ box_noise_scale: 1.0
59
+
60
+ # NEW
61
+ reg_max: 32
62
+ reg_scale: 4
63
+
64
+ # Auxiliary decoder layers dimension scaling
65
+ # "eg. If num_layers: 6 eval_idx: -4,
66
+ # then layer 3, 4, 5 are auxiliary decoder layers."
67
+ layer_scale: 1 # 2
68
+
69
+
70
+ num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
71
+ cross_attn_method: default # default, discrete
72
+ query_select_method: default # default, agnostic
73
+
74
+
75
+ PostProcessor:
76
+ num_top_queries: 300
77
+
78
+
79
+ DEIMCriterion:
80
+ weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
81
+ losses: ['vfl', 'boxes', 'local']
82
+ alpha: 0.75
83
+ gamma: 2.0
84
+ reg_max: 32
85
+
86
+ matcher:
87
+ type: HungarianMatcher
88
+ weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
89
+ alpha: 0.25
90
+ gamma: 2.0
configs/base/optimizer.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use_amp: True
2
+ use_ema: True
3
+ ema:
4
+ type: ModelEMA
5
+ decay: 0.9999
6
+ warmups: 1000
7
+ start: 0
8
+
9
+ epoches: 72
10
+ clip_max_norm: 0.1
11
+
12
+
13
+ optimizer:
14
+ type: AdamW
15
+ params:
16
+ -
17
+ params: '^(?=.*backbone)(?!.*norm).*$'
18
+ lr: 0.0000125
19
+ -
20
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
21
+ weight_decay: 0.
22
+
23
+ lr: 0.00025
24
+ betas: [0.9, 0.999]
25
+ weight_decay: 0.000125
26
+
27
+
28
+ lr_scheduler:
29
+ type: MultiStepLR
30
+ milestones: [500]
31
+ gamma: 0.1
32
+
33
+ lr_warmup_scheduler:
34
+ type: LinearWarmup
35
+ warmup_duration: 500
configs/base/rt_deim.yml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dense O2O
2
+ train_dataloader:
3
+ dataset:
4
+ transforms:
5
+ ops:
6
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
7
+ probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True}
8
+ - {type: RandomPhotometricDistort, p: 0.5}
9
+ - {type: RandomZoomOut, fill: 0}
10
+ - {type: RandomIoUCrop, p: 0.8}
11
+ - {type: SanitizeBoundingBoxes, min_size: 1}
12
+ - {type: RandomHorizontalFlip}
13
+ - {type: Resize, size: [640, 640], }
14
+ - {type: SanitizeBoundingBoxes, min_size: 1}
15
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
16
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
17
+ policy:
18
+ epoch: [4, 29, 50] # list
19
+ ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
20
+ mosaic_prob: 0.5
21
+
22
+ collate_fn:
23
+ mixup_prob: 0.5
24
+ mixup_epochs: [4, 29]
25
+ stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
26
+
27
+ # Unfreezing BN
28
+ PResNet:
29
+ freeze_at: -1 # default 0
30
+ freeze_norm: False # default True
31
+
32
+ # Activation
33
+ RTDETRTransformerv2:
34
+ query_pos_method: as_reg
35
+ activation: silu
36
+ mlp_act: silu
37
+
38
+ ## Our LR-Scheduler
39
+ lrsheduler: flatcosine
40
+ lr_gamma: 0.5
41
+ warmup_iter: 2000
42
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
43
+ no_aug_epoch: 8
44
+
45
+ ## Our Loss
46
+ DEIMCriterion:
47
+ weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2}
48
+ losses: ['mal', 'boxes', ]
49
+ gamma: 1.5
configs/base/rt_optimizer.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use_amp: True
2
+ use_ema: True
3
+ ema:
4
+ type: ModelEMA
5
+ decay: 0.9999
6
+ warmups: 2000
7
+ start: 0
8
+
9
+ epoches: 72
10
+ clip_max_norm: 0.1
11
+
12
+ train_dataloader:
13
+ total_batch_size: 16
14
+
15
+ optimizer:
16
+ type: AdamW
17
+ params:
18
+ -
19
+ params: '^(?=.*backbone)(?!.*norm).*$'
20
+ lr: 0.00001
21
+ -
22
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
23
+ weight_decay: 0.
24
+
25
+ lr: 0.0001
26
+ betas: [0.9, 0.999]
27
+ weight_decay: 0.0001
28
+
29
+ lr_scheduler:
30
+ type: MultiStepLR
31
+ milestones: [1000]
32
+ gamma: 0.1
33
+
34
+
35
+ lr_warmup_scheduler:
36
+ type: LinearWarmup
37
+ warmup_duration: 2000
configs/base/rtdetrv2_r50vd.yml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ model: DEIM
4
+ criterion: DEIMCriterion
5
+ postprocessor: PostProcessor
6
+
7
+ use_focal_loss: True
8
+ eval_spatial_size: [640, 640] # h w
9
+ checkpoint_freq: 4 # save freq
10
+
11
+ DEIM:
12
+ backbone: PResNet
13
+ encoder: HybridEncoder
14
+ decoder: RTDETRTransformerv2
15
+
16
+
17
+ # Add, default for step lr scheduler
18
+ lrsheduler: flatcosine
19
+ lr_gamma: 1
20
+ warmup_iter: 2000
21
+ flat_epoch: 4000000
22
+ no_aug_epoch: 0
23
+
24
+ PResNet:
25
+ depth: 50
26
+ variant: d
27
+ freeze_at: 0
28
+ return_idx: [1, 2, 3]
29
+ num_stages: 4
30
+ freeze_norm: True
31
+ pretrained: True
32
+ local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/
33
+
34
+
35
+ HybridEncoder:
36
+ in_channels: [512, 1024, 2048]
37
+ feat_strides: [8, 16, 32]
38
+
39
+ # intra
40
+ hidden_dim: 256
41
+ use_encoder_idx: [2]
42
+ num_encoder_layers: 1
43
+ nhead: 8
44
+ dim_feedforward: 1024
45
+ dropout: 0.
46
+ enc_act: 'gelu'
47
+
48
+ # cross
49
+ expansion: 1.0
50
+ depth_mult: 1
51
+ act: 'silu'
52
+ version: rt_detrv2 # pay attention to this
53
+
54
+
55
+ RTDETRTransformerv2:
56
+ feat_channels: [256, 256, 256]
57
+ feat_strides: [8, 16, 32]
58
+ hidden_dim: 256
59
+ num_levels: 3
60
+
61
+ num_layers: 6
62
+ num_queries: 300
63
+
64
+ num_denoising: 100
65
+ label_noise_ratio: 0.5
66
+ box_noise_scale: 1.0 # 1.0 0.4
67
+
68
+ eval_idx: -1
69
+
70
+ # NEW, can be chosen
71
+ num_points: [4, 4, 4] # [3,3,3] [2,2,2]
72
+ cross_attn_method: default # default, discrete
73
+ query_select_method: default # default, agnostic
74
+
75
+
76
+ PostProcessor:
77
+ num_top_queries: 300
78
+
79
+ DEIMCriterion:
80
+ weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
81
+ losses: ['vfl', 'boxes', ]
82
+ alpha: 0.75
83
+ gamma: 2.0
84
+ use_uni_set: False
85
+
86
+ matcher:
87
+ type: HungarianMatcher
88
+ weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
89
+ alpha: 0.25
90
+ gamma: 2.0
configs/{coco_detection.yml → dataset/coco_detection.yml} RENAMED
File without changes
configs/dataset/crowdhuman_detection.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ evaluator:
4
+ type: CocoEvaluator
5
+ iou_types: ['bbox', ]
6
+
7
+ num_classes: 2 # your dataset classes
8
+ remap_mscoco_category: False
9
+
10
+ train_dataloader:
11
+ type: DataLoader
12
+ dataset:
13
+ type: CocoDetection
14
+ img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train
15
+ ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json
16
+ return_masks: False
17
+ transforms:
18
+ type: Compose
19
+ ops: ~
20
+ shuffle: True
21
+ num_workers: 4
22
+ drop_last: True
23
+ collate_fn:
24
+ type: BatchImageCollateFunction
25
+
26
+
27
+ val_dataloader:
28
+ type: DataLoader
29
+ dataset:
30
+ type: CocoDetection
31
+ img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val
32
+ ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json
33
+ return_masks: False
34
+ transforms:
35
+ type: Compose
36
+ ops: ~
37
+ shuffle: False
38
+ num_workers: 4
39
+ drop_last: False
40
+ collate_fn:
41
+ type: BatchImageCollateFunction
configs/dataset/custom_detection.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ evaluator:
4
+ type: CocoEvaluator
5
+ iou_types: ['bbox', ]
6
+
7
+ num_classes: 777 # your dataset classes
8
+ remap_mscoco_category: False
9
+
10
+ train_dataloader:
11
+ type: DataLoader
12
+ dataset:
13
+ type: CocoDetection
14
+ img_folder: /data/yourdataset/train
15
+ ann_file: /data/yourdataset/train/train.json
16
+ return_masks: False
17
+ transforms:
18
+ type: Compose
19
+ ops: ~
20
+ shuffle: True
21
+ num_workers: 4
22
+ drop_last: True
23
+ collate_fn:
24
+ type: BatchImageCollateFunction
25
+
26
+
27
+ val_dataloader:
28
+ type: DataLoader
29
+ dataset:
30
+ type: CocoDetection
31
+ img_folder: /data/yourdataset/val
32
+ ann_file: /data/yourdataset/val/val.json
33
+ return_masks: False
34
+ transforms:
35
+ type: Compose
36
+ ops: ~
37
+ shuffle: False
38
+ num_workers: 4
39
+ drop_last: False
40
+ collate_fn:
41
+ type: BatchImageCollateFunction
configs/dataset/obj365_detection.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ evaluator:
4
+ type: CocoEvaluator
5
+ iou_types: ['bbox', ]
6
+
7
+ num_classes: 366
8
+ remap_mscoco_category: False
9
+
10
+ train_dataloader:
11
+ type: DataLoader
12
+ dataset:
13
+ type: CocoDetection
14
+ img_folder: /home/Dataset/objects365/train
15
+ ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json
16
+ return_masks: False
17
+ transforms:
18
+ type: Compose
19
+ ops: ~
20
+ shuffle: True
21
+ num_workers: 4
22
+ drop_last: True
23
+ collate_fn:
24
+ type: BatchImageCollateFunction
25
+
26
+
27
+ val_dataloader:
28
+ type: DataLoader
29
+ dataset:
30
+ type: CocoDetection
31
+ img_folder: /home/Dataset/objects365/val
32
+ ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json
33
+ return_masks: False
34
+ transforms:
35
+ type: Compose
36
+ ops: ~
37
+ shuffle: False
38
+ num_workers: 4
39
+ drop_last: False
40
+ collate_fn:
41
+ type: BatchImageCollateFunction
configs/dataset/voc_detection.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: detection
2
+
3
+ evaluator:
4
+ type: CocoEvaluator
5
+ iou_types: ['bbox', ]
6
+
7
+ num_classes: 20
8
+
9
+ train_dataloader:
10
+ type: DataLoader
11
+ dataset:
12
+ type: VOCDetection
13
+ root: ./dataset/voc/
14
+ ann_file: trainval.txt
15
+ label_file: label_list.txt
16
+ transforms:
17
+ type: Compose
18
+ ops: ~
19
+ shuffle: True
20
+ num_workers: 4
21
+ drop_last: True
22
+ collate_fn:
23
+ type: BatchImageCollateFunction
24
+
25
+
26
+ val_dataloader:
27
+ type: DataLoader
28
+ dataset:
29
+ type: VOCDetection
30
+ root: ./dataset/voc/
31
+ ann_file: test.txt
32
+ label_file: label_list.txt
33
+ transforms:
34
+ type: Compose
35
+ ops: ~
36
+ shuffle: False
37
+ num_workers: 4
38
+ drop_last: False
39
+ collate_fn:
40
+ type: BatchImageCollateFunction
configs/deim_dfine/deim_hgnetv2_l_coco.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_l_coco.yml',
3
+ '../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_hgnetv2_l_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
13
+ lr: 0.000025
14
+ -
15
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0005
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.000125
21
+
22
+ # Increase to search for the optimal ema
23
+ epoches: 58 # 50 + 8 (stop_epoch + no_aug_epoch)
24
+
25
+ ## Our LR-Scheduler
26
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
27
+ no_aug_epoch: 8
28
+
29
+ train_dataloader:
30
+ dataset:
31
+ transforms:
32
+ policy:
33
+ epoch: [4, 29, 50] # list
34
+
35
+ collate_fn:
36
+ mixup_epochs: [4, 29]
37
+ stop_epoch: 50
configs/deim_dfine/deim_hgnetv2_m_coco.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_m_coco.yml',
3
+ '../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_hgnetv2_m_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*bn).*$'
13
+ lr: 0.00004
14
+ -
15
+ params: '^(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0004
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.0001
21
+
22
+
23
+ # Increase to search for the optimal ema
24
+ epoches: 102 # 90 + 12 (stop_epoch + no_aug_epoch)
25
+
26
+ ## Our LR-Scheduler
27
+ flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
28
+ no_aug_epoch: 12
29
+
30
+ ## Our DataAug
31
+ train_dataloader:
32
+ dataset:
33
+ transforms:
34
+ policy:
35
+ epoch: [4, 49, 90] # list
36
+
37
+ collate_fn:
38
+ mixup_epochs: [4, 49]
39
+ stop_epoch: 90
configs/deim_dfine/deim_hgnetv2_n_coco.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_n_coco.yml',
3
+ '../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./deim_outputs/deim_hgnetv2_n_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
13
+ lr: 0.0004
14
+ -
15
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
16
+ lr: 0.0004
17
+ weight_decay: 0.
18
+ -
19
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
20
+ weight_decay: 0.
21
+
22
+ lr: 0.0008
23
+ betas: [0.9, 0.999]
24
+ weight_decay: 0.0001
25
+
26
+ # Increase to search for the optimal ema
27
+ epoches: 160 # 148 + 12
28
+
29
+ ## Our LR-Scheduler
30
+ flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
31
+ no_aug_epoch: 12
32
+ lr_gamma: 1.0
33
+
34
+ ## Our DataAug
35
+ train_dataloader:
36
+ dataset:
37
+ transforms:
38
+ policy:
39
+ epoch: [4, 78, 148] # list
40
+
41
+ collate_fn:
42
+ mixup_epochs: [4, 78]
43
+ stop_epoch: 148
44
+ base_size_repeat: ~
configs/deim_dfine/deim_hgnetv2_s_coco.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_s_coco.yml',
3
+ '../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_hgnetv2_s_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*bn).*$'
13
+ lr: 0.0002
14
+ -
15
+ params: '^(?=.*(?:norm|bn)).*$' # except bias
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0004
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.0001
21
+
22
+
23
+ # Increase to search for the optimal ema
24
+ epoches: 132 # 120 + 4n
25
+
26
+ ## Our LR-Scheduler
27
+ flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
28
+ no_aug_epoch: 12
29
+
30
+ ## Our DataAug
31
+ train_dataloader:
32
+ dataset:
33
+ transforms:
34
+ policy:
35
+ epoch: [4, 64, 120] # list
36
+
37
+ collate_fn:
38
+ mixup_epochs: [4, 64]
39
+ stop_epoch: 120
configs/deim_dfine/deim_hgnetv2_x_coco.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_x_coco.yml',
3
+ '../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_hgnetv2_x_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
13
+ lr: 0.000005
14
+ -
15
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0005
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.000125
21
+
22
+ # Increase to search for the optimal ema
23
+ epoches: 58 # 50 + 8 (stop_epoch + no_aug_epoch)
24
+
25
+ ## Our LR-Scheduler
26
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
27
+ no_aug_epoch: 8
28
+
29
+ train_dataloader:
30
+ dataset:
31
+ transforms:
32
+ policy:
33
+ epoch: [4, 29, 50] # list
34
+
35
+ collate_fn:
36
+ mixup_epochs: [4, 29]
37
+ stop_epoch: 50
configs/deim_dfine/dfine_hgnetv2_l_coco.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/dfine_hgnetv2_l_coco
10
+
11
+
12
+ HGNetv2:
13
+ name: 'B4'
14
+ return_idx: [1, 2, 3]
15
+ freeze_stem_only: True
16
+ freeze_at: 0
17
+ freeze_norm: True
18
+
19
+ optimizer:
20
+ type: AdamW
21
+ params:
22
+ -
23
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
24
+ lr: 0.0000125
25
+ -
26
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
27
+ weight_decay: 0.
28
+
29
+ lr: 0.00025
30
+ betas: [0.9, 0.999]
31
+ weight_decay: 0.000125
32
+
33
+
34
+ # Increase to search for the optimal ema
35
+ epoches: 80 # 72 + 2n
36
+ train_dataloader:
37
+ dataset:
38
+ transforms:
39
+ policy:
40
+ epoch: 72
41
+ collate_fn:
42
+ stop_epoch: 72
43
+ ema_restart_decay: 0.9999
44
+ base_size_repeat: 4
configs/deim_dfine/dfine_hgnetv2_m_coco.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./output/dfine_hgnetv2_m_coco
10
+
11
+
12
+ DEIM:
13
+ backbone: HGNetv2
14
+
15
+ HGNetv2:
16
+ name: 'B2'
17
+ return_idx: [1, 2, 3]
18
+ freeze_at: -1
19
+ freeze_norm: False
20
+ use_lab: True
21
+
22
+ DFINETransformer:
23
+ num_layers: 4 # 5 6
24
+ eval_idx: -1 # -2 -3
25
+
26
+ HybridEncoder:
27
+ in_channels: [384, 768, 1536]
28
+ hidden_dim: 256
29
+ depth_mult: 0.67
30
+
31
+ optimizer:
32
+ type: AdamW
33
+ params:
34
+ -
35
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
36
+ lr: 0.00002
37
+ -
38
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
39
+ lr: 0.00002
40
+ weight_decay: 0.
41
+ -
42
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
43
+ weight_decay: 0.
44
+
45
+ lr: 0.0002
46
+ betas: [0.9, 0.999]
47
+ weight_decay: 0.0001
48
+
49
+
50
+ # Increase to search for the optimal ema
51
+ epoches: 132 # 120 + 4n
52
+ train_dataloader:
53
+ dataset:
54
+ transforms:
55
+ policy:
56
+ epoch: 120
57
+ collate_fn:
58
+ stop_epoch: 120
59
+ ema_restart_decay: 0.9999
60
+ base_size_repeat: 6
configs/deim_dfine/dfine_hgnetv2_n_coco.yml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./output/dfine_hgnetv2_n_coco
10
+
11
+
12
+ DEIM:
13
+ backbone: HGNetv2
14
+
15
+ HGNetv2:
16
+ name: 'B0'
17
+ return_idx: [2, 3]
18
+ freeze_at: -1
19
+ freeze_norm: False
20
+ use_lab: True
21
+
22
+
23
+ HybridEncoder:
24
+ in_channels: [512, 1024]
25
+ feat_strides: [16, 32]
26
+
27
+ # intra
28
+ hidden_dim: 128
29
+ use_encoder_idx: [1]
30
+ dim_feedforward: 512
31
+
32
+ # cross
33
+ expansion: 0.34
34
+ depth_mult: 0.5
35
+
36
+
37
+ DFINETransformer:
38
+ feat_channels: [128, 128]
39
+ feat_strides: [16, 32]
40
+ hidden_dim: 128
41
+ dim_feedforward: 512
42
+ num_levels: 2
43
+
44
+ num_layers: 3
45
+ eval_idx: -1
46
+
47
+ num_points: [6, 6]
48
+
49
+ optimizer:
50
+ type: AdamW
51
+ params:
52
+ -
53
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
54
+ lr: 0.0004
55
+ -
56
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
57
+ lr: 0.0004
58
+ weight_decay: 0.
59
+ -
60
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
61
+ weight_decay: 0.
62
+
63
+ lr: 0.0008
64
+ betas: [0.9, 0.999]
65
+ weight_decay: 0.0001
66
+
67
+
68
+ # Increase to search for the optimal ema
69
+ epoches: 160 # 148 + 4n
70
+ train_dataloader:
71
+ total_batch_size: 128
72
+ dataset:
73
+ transforms:
74
+ policy:
75
+ epoch: 148
76
+ collate_fn:
77
+ stop_epoch: 148
78
+ ema_restart_decay: 0.9999
79
+ base_size_repeat: ~
80
+
81
+ val_dataloader:
82
+ total_batch_size: 256
configs/deim_dfine/dfine_hgnetv2_s_coco.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./output/dfine_hgnetv2_s_coco
10
+
11
+
12
+ DEIM:
13
+ backbone: HGNetv2
14
+
15
+ HGNetv2:
16
+ name: 'B0'
17
+ return_idx: [1, 2, 3]
18
+ freeze_at: -1
19
+ freeze_norm: False
20
+ use_lab: True
21
+
22
+ DFINETransformer:
23
+ num_layers: 3 # 4 5 6
24
+ eval_idx: -1 # -2 -3 -4
25
+
26
+ HybridEncoder:
27
+ in_channels: [256, 512, 1024]
28
+ hidden_dim: 256
29
+ depth_mult: 0.34
30
+ expansion: 0.5
31
+
32
+ optimizer:
33
+ type: AdamW
34
+ params:
35
+ -
36
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
37
+ lr: 0.0001
38
+ -
39
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
40
+ lr: 0.0001
41
+ weight_decay: 0.
42
+ -
43
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
44
+ weight_decay: 0.
45
+
46
+ lr: 0.0002
47
+ betas: [0.9, 0.999]
48
+ weight_decay: 0.0001
49
+
50
+
51
+ # Increase to search for the optimal ema
52
+ epoches: 132 # 120 + 4n
53
+ train_dataloader:
54
+ dataset:
55
+ transforms:
56
+ policy:
57
+ epoch: 120
58
+ collate_fn:
59
+ stop_epoch: 120
60
+ ema_restart_decay: 0.9999
61
+ base_size_repeat: 20
configs/deim_dfine/dfine_hgnetv2_x_coco.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./output/dfine_hgnetv2_x_coco
10
+
11
+
12
+ DEIM:
13
+ backbone: HGNetv2
14
+
15
+ HGNetv2:
16
+ name: 'B5'
17
+ return_idx: [1, 2, 3]
18
+ freeze_stem_only: True
19
+ freeze_at: 0
20
+ freeze_norm: True
21
+
22
+ HybridEncoder:
23
+ # intra
24
+ hidden_dim: 384
25
+ dim_feedforward: 2048
26
+
27
+ DFINETransformer:
28
+ feat_channels: [384, 384, 384]
29
+ reg_scale: 8
30
+
31
+ optimizer:
32
+ type: AdamW
33
+ params:
34
+ -
35
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
36
+ lr: 0.0000025
37
+ -
38
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
39
+ weight_decay: 0.
40
+
41
+ lr: 0.00025
42
+ betas: [0.9, 0.999]
43
+ weight_decay: 0.000125
44
+
45
+
46
+ # Increase to search for the optimal ema
47
+ epoches: 80 # 72 + 2n
48
+ train_dataloader:
49
+ dataset:
50
+ transforms:
51
+ policy:
52
+ epoch: 72
53
+ collate_fn:
54
+ stop_epoch: 72
55
+ ema_restart_decay: 0.9998
56
+ base_size_repeat: 3
configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './dfine_hgnetv2_x_obj2coco.yml',
3
+ '../../base/deim.yml'
4
+ ]
5
+
6
+ output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e
7
+
8
+ HGNetv2:
9
+ freeze_at: 0 # 0 default
10
+ freeze_norm: True # True default
11
+
12
+ # Activation
13
+ DFINETransformer:
14
+ activation: relu
15
+ mlp_act: relu
16
+
17
+ optimizer:
18
+ type: AdamW
19
+ params:
20
+ -
21
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
22
+ lr: 0.0000025
23
+ -
24
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
25
+ weight_decay: 0.
26
+
27
+ lr: 0.00025
28
+ betas: [0.9, 0.999]
29
+ weight_decay: 0.000125
30
+
31
+ # Increase to search for the optimal ema
32
+ epoches: 24 # 20 + 2n
33
+
34
+ ## Our LR-Scheduler
35
+ lrsheduler: flatcosine
36
+ lr_gamma: 1
37
+ warmup_iter: 0 # 0
38
+ flat_epoch: 12000 # usual rule: 4 + epoch // 2, e.g., 40 = 4 + 72 / 2; NOTE(review): 12000 far exceeds epoches (24) -- confirm this is intentional
39
+ no_aug_epoch: 4
40
+
41
+ ## Our DataAug
42
+ train_dataloader:
43
+ dataset:
44
+ transforms:
45
+ policy:
46
+ epoch: [2, 12, 20] # list
47
+
48
+ collate_fn:
49
+ mixup_epochs: [2, 12]
50
+ stop_epoch: 20
configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../../dataset/coco_detection.yml',
3
+ '../../runtime.yml',
4
+ '../../base/dataloader.yml',
5
+ '../../base/optimizer.yml',
6
+ '../../base/dfine_hgnetv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/dfine_hgnetv2_x_obj2coco
10
+
11
+ HGNetv2:
12
+ name: 'B5'
13
+ return_idx: [1, 2, 3]
14
+ freeze_stem_only: True
15
+ freeze_at: 0
16
+ freeze_norm: True
17
+
18
+ HybridEncoder:
19
+ # intra
20
+ hidden_dim: 384
21
+ dim_feedforward: 2048
22
+
23
+ DFINETransformer:
24
+ feat_channels: [384, 384, 384]
25
+ reg_scale: 8
26
+
27
+ optimizer:
28
+ type: AdamW
29
+ params:
30
+ -
31
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
32
+ lr: 0.0000025
33
+ -
34
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
35
+ weight_decay: 0.
36
+
37
+ lr: 0.00025
38
+ betas: [0.9, 0.999]
39
+ weight_decay: 0.000125
40
+
41
+
42
+ epoches: 36 # Early stop
43
+ train_dataloader:
44
+ dataset:
45
+ transforms:
46
+ policy:
47
+ epoch: 30
48
+ collate_fn:
49
+ stop_epoch: 30
50
+ ema_restart_decay: 0.9999
51
+ base_size_repeat: 3
52
+
53
+ ema:
54
+ warmups: 0
55
+
56
+ lr_warmup_scheduler:
57
+ warmup_duration: 0
configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './rtdetrv2_r101vd_6x_coco.yml',
3
+ '../base/rt_deim.yml',
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm).*$'
13
+ lr: 0.000002
14
+ -
15
+ params: '^(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0002
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.0001
21
+
22
+
23
+ # change part
24
+ epoches: 60
25
+ flat_epoch: 34 # 4 + 60 / 2
26
+ no_aug_epoch: 2
27
+
28
+ train_dataloader:
29
+ dataset:
30
+ transforms:
31
+ policy:
32
+ epoch: [4, 34, 58] # list
33
+
34
+ collate_fn:
35
+ mixup_epochs: [4, 34]
36
+ stop_epoch: 58
configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './rtdetrv2_r18vd_120e_coco.yml',
3
+ '../base/rt_deim.yml',
4
+ ]
5
+
6
+ output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*(?:norm|bn)).*$'
13
+ weight_decay: 0.
14
+
15
+ lr: 0.0002
16
+ betas: [0.9, 0.999]
17
+ weight_decay: 0.0001
18
+
19
+ # change part
20
+ epoches: 120
21
+ flat_epoch: 64 # 4 + 120 / 2
22
+ no_aug_epoch: 3
23
+
24
+ train_dataloader:
25
+ dataset:
26
+ transforms:
27
+ policy:
28
+ epoch: [4, 64, 117] # list
29
+
30
+ collate_fn:
31
+ mixup_epochs: [4, 64]
32
+ stop_epoch: 117
configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './rtdetrv2_r34vd_120e_coco.yml',
3
+ '../base/rt_deim.yml',
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm).*$'
13
+ lr: 0.0001
14
+ -
15
+ params: '^(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0002
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.0001
21
+
22
+
23
+ # change part
24
+ epoches: 120
25
+ flat_epoch: 64
26
+ no_aug_epoch: 3
27
+
28
+ train_dataloader:
29
+ dataset:
30
+ transforms:
31
+ policy:
32
+ epoch: [4, 64, 117] # list
33
+
34
+ collate_fn:
35
+ mixup_epochs: [4, 64]
36
+ stop_epoch: 117
configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './rtdetrv2_r50vd_6x_coco.yml',
3
+ '../base/rt_deim.yml',
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco
7
+
8
+ optimizer:
9
+ type: AdamW
10
+ params:
11
+ -
12
+ params: '^(?=.*backbone)(?!.*norm).*$'
13
+ lr: 0.00002
14
+ -
15
+ params: '^(?=.*(?:norm|bn)).*$'
16
+ weight_decay: 0.
17
+
18
+ lr: 0.0002
19
+ betas: [0.9, 0.999]
20
+ weight_decay: 0.0001
21
+
22
+ # change part
23
+ epoches: 60
24
+ flat_epoch: 34 # 4 + 60 / 2
25
+ no_aug_epoch: 2
26
+
27
+ train_dataloader:
28
+ dataset:
29
+ transforms:
30
+ policy:
31
+ epoch: [4, 34, 58] # list
32
+
33
+ collate_fn:
34
+ mixup_epochs: [4, 34]
35
+ stop_epoch: 58
configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ './rtdetrv2_r50vd_m_7x_coco.yml',
3
+ '../base/rt_deim.yml',
4
+ ]
5
+
6
+ output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco
7
+
8
+ RTDETRTransformerv2:
9
+ eval_idx: 2 # use the 3rd decoder layer for eval
10
+ num_layers: 3
11
+
12
+ optimizer:
13
+ type: AdamW
14
+ params:
15
+ -
16
+ params: '^(?=.*backbone)(?!.*norm).*$'
17
+ lr: 0.00002
18
+ -
19
+ params: '^(?=.*(?:norm|bn)).*$'
20
+ weight_decay: 0.
21
+
22
+ lr: 0.0002
23
+ betas: [0.9, 0.999]
24
+ weight_decay: 0.0001
25
+
26
+ # change part
27
+ epoches: 60
28
+ flat_epoch: 34 # 4 + 60 / 2
29
+ no_aug_epoch: 2
30
+
31
+ train_dataloader:
32
+ dataset:
33
+ transforms:
34
+ policy:
35
+ epoch: [4, 34, 58] # list
36
+
37
+ collate_fn:
38
+ mixup_epochs: [4, 34]
39
+ stop_epoch: 58
configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/rt_optimizer.yml',
6
+ '../base/rtdetrv2_r50vd.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./outputs/rtdetrv2_r101vd_6x_coco
11
+
12
+
13
+ PResNet:
14
+ depth: 101
15
+
16
+
17
+ HybridEncoder:
18
+ # intra
19
+ hidden_dim: 384
20
+ dim_feedforward: 2048
21
+
22
+
23
+ RTDETRTransformerv2:
24
+ feat_channels: [384, 384, 384]
25
+
26
+
27
+ optimizer:
28
+ type: AdamW
29
+ params:
30
+ -
31
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
32
+ lr: 0.000001
33
+ -
34
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm
35
+ weight_decay: 0.
36
+
37
+ lr: 0.0001
38
+ betas: [0.9, 0.999]
39
+ weight_decay: 0.0001
40
+
configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/rt_optimizer.yml',
6
+ '../base/rtdetrv2_r50vd.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./output/rtdetrv2_r18vd_120e_coco
11
+
12
+
13
+ PResNet:
14
+ depth: 18
15
+ freeze_at: -1
16
+ freeze_norm: False
17
+ pretrained: True
18
+
19
+ HybridEncoder:
20
+ in_channels: [128, 256, 512]
21
+ hidden_dim: 256
22
+ expansion: 0.5
23
+
24
+ RTDETRTransformerv2:
25
+ num_layers: 3
26
+
27
+
28
+ epoches: 120
29
+
30
+ optimizer:
31
+ type: AdamW
32
+ params:
33
+ -
34
+ params: '^(?=.*(?:norm|bn)).*$'
35
+ weight_decay: 0.
36
+
37
+
38
+ train_dataloader:
39
+ dataset:
40
+ transforms:
41
+ policy:
42
+ epoch: 117
43
+ collate_fn:
44
+ scales: ~
configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/rt_optimizer.yml',
6
+ '../base/rtdetrv2_r50vd.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./outputs/rtdetrv2_r34vd_120e_coco
11
+
12
+
13
+ PResNet:
14
+ depth: 34
15
+ freeze_at: -1
16
+ freeze_norm: False
17
+ pretrained: True
18
+
19
+
20
+ HybridEncoder:
21
+ in_channels: [128, 256, 512]
22
+ hidden_dim: 256
23
+ expansion: 0.5
24
+
25
+
26
+ RTDETRTransformerv2:
27
+ num_layers: 4
28
+
29
+
30
+ epoches: 120
31
+
32
+ optimizer:
33
+ type: AdamW
34
+ params:
35
+ -
36
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
37
+ lr: 0.00005
38
+ -
39
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
40
+ lr: 0.00005
41
+ weight_decay: 0.
42
+ -
43
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
44
+ weight_decay: 0.
45
+
46
+ lr: 0.0001
47
+ betas: [0.9, 0.999]
48
+ weight_decay: 0.0001
49
+
50
+
51
+ train_dataloader:
52
+ dataset:
53
+ transforms:
54
+ policy:
55
+ epoch: 117
56
+ collate_fn:
57
+ stop_epoch: 117
configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/rt_optimizer.yml',
6
+ '../base/rtdetrv2_r50vd.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./outputs/rtdetrv2_r50vd_6x_coco
11
+
12
+
13
+ optimizer:
14
+ type: AdamW
15
+ params:
16
+ -
17
+ params: '^(?=.*backbone)(?!.*norm).*$'
18
+ lr: 0.00001
19
+ -
20
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
21
+ weight_decay: 0.
22
+
23
+ lr: 0.0001
24
+ betas: [0.9, 0.999]
25
+ weight_decay: 0.0001
configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/rt_optimizer.yml',
6
+ '../base/rtdetrv2_r50vd.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco
10
+
11
+
12
+ HybridEncoder:
13
+ expansion: 0.5
14
+
15
+
16
+ RTDETRTransformerv2:
17
+ eval_idx: 2 # use the 3rd decoder layer for eval
18
+
19
+
20
+ epoches: 84
21
+
22
+ optimizer:
23
+ type: AdamW
24
+ params:
25
+ -
26
+ params: '^(?=.*backbone)(?!.*norm).*$'
27
+ lr: 0.00001
28
+ -
29
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
30
+ weight_decay: 0.
31
+
32
+ lr: 0.0001
33
+ betas: [0.9, 0.999]
34
+ weight_decay: 0.0001
35
+
36
+
37
+ train_dataloader:
38
+ dataset:
39
+ transforms:
40
+ policy:
41
+ epoch: 81
42
+ collate_fn:
43
+ stop_epoch: 81
configs/deimv2/deimv2_dinov3_l_coco.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./outputs/deimv2_dinov3_l_coco
11
+
12
+ DEIM:
13
+ backbone: DINOv3STAs
14
+
15
+ DINOv3STAs:
16
+ name: dinov3_vits16
17
+ weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth
18
+ interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
19
+ finetune: True
20
+ conv_inplane: 32
21
+ hidden_dim: 224
22
+
23
+ HybridEncoder:
24
+ in_channels: [224, 224, 224]
25
+ hidden_dim: 224
26
+ dim_feedforward: 896
27
+
28
+ DEIMTransformer:
29
+ feat_channels: [224, 224, 224]
30
+ hidden_dim: 224
31
+ num_layers: 4
32
+ eval_idx: -1
33
+ dim_feedforward: 1792
34
+
35
+ ## DEIM LR-Scheduler
36
+ epoches: 68 # 60 + 2n # Increase to search for the optimal EMA
37
+
38
+ lrsheduler: flatcosine
39
+ lr_gamma: 0.5
40
+ warmup_iter: 2000
41
+ flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
42
+ no_aug_epoch: 8
43
+
44
+ ## Optimizer
45
+ optimizer:
46
+ type: AdamW
47
+ params:
48
+ -
49
+ # self.dinov3 parameters, excluding norm/bn/bias
50
+ params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
51
+ lr: 0.0000125
52
+ -
53
+ # including norm/bn/bias in self.dinov3
54
+ params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
55
+ lr: 0.0000125
56
+ weight_decay: 0.
57
+ -
58
+ # norm/bn/bias parameters outside self.dinov3 (sta/encoder/decoder)
59
+ params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
60
+ weight_decay: 0.
61
+
62
+ lr: 0.0005
63
+ betas: [0.9, 0.999]
64
+ weight_decay: 0.000125
65
+
66
+
67
+ ## Dense O2O: Mosaic + Mixup + CopyBlend
68
+ train_dataloader:
69
+ dataset:
70
+ transforms:
71
+ ops:
72
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
73
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
74
+ - {type: RandomPhotometricDistort, p: 0.5}
75
+ - {type: RandomZoomOut, fill: 0}
76
+ - {type: RandomIoUCrop, p: 0.8}
77
+ - {type: SanitizeBoundingBoxes, min_size: 1}
78
+ - {type: RandomHorizontalFlip}
79
+ - {type: Resize, size: [640, 640], }
80
+ - {type: SanitizeBoundingBoxes, min_size: 1}
81
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
82
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
83
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
84
+ policy:
85
+ epoch: [4, 34, 60] # list
86
+
87
+ collate_fn:
88
+ mixup_epochs: [4, 34]
89
+ stop_epoch: 60
90
+ copyblend_epochs: [4, 60]
91
+ base_size_repeat: 3
92
+
93
+ val_dataloader:
94
+ dataset:
95
+ transforms:
96
+ ops:
97
+ - {type: Resize, size: [640, 640], }
98
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
99
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
100
+
101
+ ## DEIM Loss
102
+ DEIMCriterion:
103
+ matcher:
104
+ matcher_change_epoch: 50
configs/deimv2/deimv2_dinov3_m_coco.yml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_dinov3_m_coco
10
+
11
+ DEIM:
12
+ backbone: DINOv3STAs
13
+
14
+ DINOv3STAs:
15
+ name: vit_tinyplus
16
+ embed_dim: 256
17
+ weights_path: ./ckpts/vittplus_distill.pt
18
+ interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
19
+ num_heads: 4
20
+
21
+ HybridEncoder:
22
+ in_channels: [256, 256, 256]
23
+ depth_mult: 1
24
+ expansion: 0.67
25
+ hidden_dim: 256
26
+ dim_feedforward: 512
27
+
28
+
29
+ DEIMTransformer:
30
+ feat_channels: [256, 256, 256]
31
+ hidden_dim: 256
32
+ dim_feedforward: 512
33
+ num_layers: 4 # 4 5 6
34
+ eval_idx: -1 # -2 -3 -4
35
+
36
+ optimizer:
37
+ type: AdamW
38
+
39
+ params:
40
+ -
41
+ # self.dinov3 parameters, excluding norm/bn/bias
42
+ params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
43
+ lr: 0.000025
44
+ -
45
+ # including norm/bn/bias in self.dinov3
46
+ params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
47
+ lr: 0.000025
48
+ weight_decay: 0.
49
+ -
50
+ # norm/bn/bias parameters outside self.dinov3 (sta/encoder/decoder)
51
+ params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
52
+ weight_decay: 0.
53
+
54
+ lr: 0.0005
55
+ betas: [0.9, 0.999]
56
+ weight_decay: 0.0001
57
+
58
+ epoches: 102 # 90 + 4n
59
+
60
+ ## Our LR-Scheduler
61
+ flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
62
+ no_aug_epoch: 12
63
+
64
+
65
+ ## Our DataAug
66
+ train_dataloader:
67
+ dataset:
68
+ transforms:
69
+ ops:
70
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
71
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
72
+ - {type: RandomPhotometricDistort, p: 0.5}
73
+ - {type: RandomZoomOut, fill: 0}
74
+ - {type: RandomIoUCrop, p: 0.8}
75
+ - {type: SanitizeBoundingBoxes, min_size: 1}
76
+ - {type: RandomHorizontalFlip}
77
+ - {type: Resize, size: [640, 640], }
78
+ - {type: SanitizeBoundingBoxes, min_size: 1}
79
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
80
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
81
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
82
+ policy:
83
+ epoch: [4, 49, 90] # list
84
+
85
+ collate_fn:
86
+ mixup_prob: 0.5
87
+ ema_restart_decay: 0.9999
88
+ base_size_repeat: 6
89
+ mixup_epochs: [4, 49]
90
+ stop_epoch: 90
91
+ copyblend_epochs: [4, 90]
92
+
93
+
94
+ val_dataloader:
95
+ dataset:
96
+ transforms:
97
+ ops:
98
+ - {type: Resize, size: [640, 640], }
99
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
100
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
101
+
102
+ DEIMCriterion:
103
+ matcher:
104
+ # new matcher
105
+ change_matcher: True
106
+ iou_order_alpha: 4.0
107
+ matcher_change_epoch: 80
configs/{deimv2_floorplan.yaml → deimv2/deimv2_dinov3_s_coco.yml} RENAMED
@@ -1,58 +1,21 @@
1
  __include__: [
2
- 'coco_detection.yml', # 同じディレクトリ内
3
- # '../configs/runtime.yml', # 存在しない場合はコメントアウト
4
- # '../configs/base/dataloader.yml', # 存在しない場合はコメントアウト
5
- # '../configs/base/optimizer.yml',
6
- # '../configs/base/deimv2.yml', # 存在しない場合はコメントアウト
7
  ]
8
 
9
- output_dir: ./outputs/deimv2_floorplan
10
 
11
- # モデル定義(engine/core.pyが参照する)
12
- model:
13
- type: DEIM
14
- backbone:
15
- type: DINOv3STAs
16
- name: vit_tiny
17
- weights_path: ./ckpts/vitt_distill.pt
18
- interaction_indexes: [3, 7, 11]
19
- num_heads: 3
20
- embed_dim: 192
21
- encoder:
22
- type: HybridEncoder
23
- in_channels: [192, 192, 192]
24
- depth_mult: 0.67
25
- expansion: 0.34
26
- hidden_dim: 192
27
- dim_feedforward: 512
28
- decoder:
29
- type: DEIMTransformer
30
- feat_channels: [192, 192, 192]
31
- hidden_dim: 192
32
- dim_feedforward: 512
33
- num_layers: 4 # 4 5 6
34
- eval_idx: -1 # -2 -3 -4
35
-
36
- # ポストプロセッサ定義(engine/core.pyが参照する)
37
- postprocessor:
38
- type: PostProcessor
39
-
40
- # 互換性のため残す(必要に応じて)
41
  DEIM:
42
  backbone: DINOv3STAs
43
 
44
- Model:
45
- num_classes: 16
46
- class_names: ["kanki", "kanki_shikaku", "kanki_regisuta", "window1", "window2", "door1", "door2", "bathtub1", "konro1", "sink1", "toilet1", "kasaikeihou1", "kasaikeihou2", "houi1", "houi2", "houi3"]
47
-
48
- # eval_spatial_sizeを明示的に設定(推論時の画像サイズ)
49
- eval_spatial_size: [640, 640]
50
-
51
  DINOv3STAs:
52
  name: vit_tiny
53
  embed_dim: 192
54
- weights_path: ./ckpts/vitt_distill.pt # 事前学習を使わないなら行ごと削除
55
- interaction_indexes: [3, 7, 11]
56
  num_heads: 3
57
 
58
  HybridEncoder:
@@ -93,97 +56,53 @@ optimizer:
93
  betas: [0.9, 0.999]
94
  weight_decay: 0.0001
95
 
96
- epoches: 400
97
- flat_epoch: 196
98
- no_aug_epoch: 46
99
-
100
- # optimizer.ymlから必要な設定を手動で追加
101
- use_amp: True
102
- use_ema: True
103
- ema:
104
- type: ModelEMA
105
- decay: 0.9999
106
- warmups: 1000
107
- start: 0
108
-
109
- clip_max_norm: 0.1
110
- sync_bn: True
111
- find_unused_parameters: True
112
-
113
- # 学習率スケジューリング設定
114
- # CosineAnnealingLR専用設定(パラメータを最小限に)
115
- lr_scheduler:
116
- type: CosineAnnealingLR
117
- T_max: 400
118
- eta_min: 0.0000001
119
 
120
- lr_warmup_scheduler:
121
- type: LinearWarmup
122
- warmup_duration: 1000
123
 
124
- # 既存のflatcosineスケジューラーを無効化
125
- lrsheduler: null
126
-
127
- # deimv2.ymlのflatcosineスケジューラーも無効化
128
- lr_gamma: null
129
- warmup_iter: null
130
- flat_epoch: null
131
- no_aug_epoch: null
132
-
133
-
134
- # ---- Data Aug / Loader(図面+640px+OOM対策)----
135
  train_dataloader:
136
- dataset:
137
  transforms:
138
  ops:
139
- # 640でのピーク抑制のためMosaicは確率低め/スケール幅絞り
140
- - {type: Mosaic, output_size: 640, rotation_range: 8, translation_range: [0.1, 0.1],
141
- scaling_range: [0.9, 1.1], probability: 0.2, fill_value: 0, use_cache: True,
142
- max_cached_images: 20, random_pop: True}
143
- - {type: RandomPhotometricDistort, p: 0.2}
144
  - {type: RandomZoomOut, fill: 0}
145
- - {type: RandomIoUCrop, p: 0.6}
146
  - {type: SanitizeBoundingBoxes, min_size: 1}
147
  - {type: RandomHorizontalFlip}
148
- - {type: RandomRotation, degrees: [90, 180, 270, 360], p: 0.5} # 修正版で有効化
149
- - {type: Resize, size: [640, 640]} # ★ 640固定
150
  - {type: SanitizeBoundingBoxes, min_size: 1}
151
  - {type: ConvertPILImage, dtype: 'float32', scale: True}
152
  - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
153
  - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
154
  policy:
155
- epoch: [8, 192, 352] # 400epochに合わせて調整
156
 
157
- collate_fn: # 線画での崩れ防止&メモリ抑制
 
 
158
  ema_restart_decay: 0.9999
159
- base_size_repeat: 1 # ★ 1にして実質マルチスケールOFF
160
- stop_epoch: 352 # 400epochの90%程度で停止
161
- copyblend_epochs: [8, 352] # 400epochに合わせて調整
162
-
163
- # 実装が読む場合のみ有効。読まない場合は base/dataloader.yml や起動引数で制御
164
- total_batch_size: 4 # ★ まずは 4 に落として安定化
165
 
166
  val_dataloader:
167
  dataset:
168
  transforms:
169
  ops:
170
- - {type: Resize, size: [640, 640]}
171
  - {type: ConvertPILImage, dtype: 'float32', scale: True}
172
  - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
173
- total_batch_size: 6 # 評価も同程度に
174
 
175
  DEIMCriterion:
176
  matcher:
 
177
  change_matcher: True
178
  iou_order_alpha: 4.0
179
- matcher_change_epoch: 300
180
- gamma: 1.5
181
- alpha: 0.75
182
- weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
183
- losses: [mal, boxes, local]
184
-
185
- # 出力設定 - 最後のエポック必ず保存
186
- output:
187
- save_last: true
188
- save_interval: 5 # チェックポイント保存間隔
189
- checkpoint_freq: 5 # 学習ループでの保存頻度
 
1
  __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
  ]
8
 
9
+ output_dir: ./outputs/deimv2_dinov3_s_coco
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  DEIM:
12
  backbone: DINOv3STAs
13
 
 
 
 
 
 
 
 
14
  DINOv3STAs:
15
  name: vit_tiny
16
  embed_dim: 192
17
+ weights_path: ./ckpts/vitt_distill.pt
18
+ interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
19
  num_heads: 3
20
 
21
  HybridEncoder:
 
56
  betas: [0.9, 0.999]
57
  weight_decay: 0.0001
58
 
59
+ # Increase to search for the optimal ema
60
+ epoches: 132 # 120 + 4n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ ## Our LR-Scheduler
63
+ flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
64
+ no_aug_epoch: 12
65
 
66
+ ## Our DataAug
 
 
 
 
 
 
 
 
 
 
67
  train_dataloader:
68
+ dataset:
69
  transforms:
70
  ops:
71
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
72
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
73
+ - {type: RandomPhotometricDistort, p: 0.5}
 
 
74
  - {type: RandomZoomOut, fill: 0}
75
+ - {type: RandomIoUCrop, p: 0.8}
76
  - {type: SanitizeBoundingBoxes, min_size: 1}
77
  - {type: RandomHorizontalFlip}
78
+ - {type: Resize, size: [640, 640], }
 
79
  - {type: SanitizeBoundingBoxes, min_size: 1}
80
  - {type: ConvertPILImage, dtype: 'float32', scale: True}
81
  - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
82
  - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
83
  policy:
84
+ epoch: [4, 64, 120] # list
85
 
86
+ collate_fn:
87
+ base_size: 640
88
+ mixup_prob: 0.5
89
  ema_restart_decay: 0.9999
90
+ base_size_repeat: 20
91
+ mixup_epochs: [4, 64]
92
+ stop_epoch: 120
93
+ copyblend_epochs: [4, 120]
 
 
94
 
95
  val_dataloader:
96
  dataset:
97
  transforms:
98
  ops:
99
+ - {type: Resize, size: [640, 640], }
100
  - {type: ConvertPILImage, dtype: 'float32', scale: True}
101
  - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
 
102
 
103
  DEIMCriterion:
104
  matcher:
105
+ # change matcher
106
  change_matcher: True
107
  iou_order_alpha: 4.0
108
+ matcher_change_epoch: 100
 
 
 
 
 
 
 
 
 
 
configs/deimv2/deimv2_dinov3_x_coco.yml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+
10
+ output_dir: ./outputs/deimv2_dinov3_x_coco
11
+
12
+ DEIM:
13
+ backbone: DINOv3STAs
14
+
15
+ DINOv3STAs:
16
+ name: dinov3_vits16plus
17
+ weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth
18
+ interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
19
+ finetune: True
20
+ conv_inplane: 64
21
+ hidden_dim: 256
22
+
23
+ HybridEncoder:
24
+ in_channels: [256, 256, 256]
25
+ # intra
26
+ hidden_dim: 256
27
+ dim_feedforward: 1024
28
+
29
+ # cross
30
+ expansion: 1.25
31
+ depth_mult: 1.37
32
+
33
+ DEIMTransformer:
34
+ num_layers: 6
35
+ eval_idx: -1
36
+ feat_channels: [256, 256, 256]
37
+ # reg_scale: 8
38
+ hidden_dim: 256
39
+ dim_feedforward: 2048
40
+
41
+ optimizer:
42
+ type: AdamW
43
+ params:
44
+ -
45
+ # self.dinov3 parameters, excluding norm/bn/bias
46
+ params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
47
+ lr: 0.00001
48
+ -
49
+ # including norm/bn/bias in self.dinov3
50
+ params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
51
+ lr: 0.00001
52
+ weight_decay: 0.
53
+ -
54
+ # norm/bn/bias parameters outside self.dinov3 (sta/encoder/decoder)
55
+ params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
56
+ weight_decay: 0.
57
+
58
+ lr: 0.0005
59
+ betas: [0.9, 0.999]
60
+ weight_decay: 0.000125
61
+
62
+ ## Dense O2O: Mosaic + Mixup + CopyBlend
63
+ train_dataloader:
64
+ dataset:
65
+ transforms:
66
+ ops:
67
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
68
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
69
+ - {type: RandomPhotometricDistort, p: 0.5}
70
+ - {type: RandomZoomOut, fill: 0}
71
+ - {type: RandomIoUCrop, p: 0.8}
72
+ - {type: SanitizeBoundingBoxes, min_size: 1}
73
+ - {type: RandomHorizontalFlip}
74
+ - {type: Resize, size: [640, 640], }
75
+ - {type: SanitizeBoundingBoxes, min_size: 1}
76
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
77
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
78
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
79
+ policy:
80
+ epoch: [4, 29, 50] # list
81
+
82
+ collate_fn:
83
+ mixup_epochs: [4, 29]
84
+ stop_epoch: 50
85
+ copyblend_epochs: [4, 50]
86
+ base_size_repeat: 3
87
+
88
+ val_dataloader:
89
+ dataset:
90
+ transforms:
91
+ ops:
92
+ - {type: Resize, size: [640, 640], }
93
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
94
+ - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
configs/deimv2/deimv2_hgnetv2_atto_coco.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_atto_coco
10
+
11
+ DEIM:
12
+ encoder: LiteEncoder
13
+
14
+ HGNetv2:
15
+ name: 'Atto'
16
+ return_idx: [2]
17
+ freeze_at: -1
18
+ freeze_norm: False
19
+ use_lab: True
20
+
21
+ LiteEncoder:
22
+ in_channels: [256]
23
+ feat_strides: [16]
24
+ # intra
25
+ hidden_dim: 64
26
+
27
+ # cross
28
+ expansion: 0.34
29
+ depth_mult: 0.5
30
+ act: 'silu'
31
+
32
+
33
+ DEIMTransformer:
34
+ feat_channels: [64, 64]
35
+ feat_strides: [16, 32]
36
+ hidden_dim: 64
37
+ num_levels: 2
38
+ num_points: [4, 2]
39
+
40
+ num_layers: 3
41
+ eval_idx: -1
42
+ num_queries: 100
43
+
44
+ # FFN
45
+ dim_feedforward: 160
46
+
47
+ # New options for DEIMv2
48
+ share_bbox_head: True
49
+ use_gateway: False
50
+
51
+ # Increase to search for the optimal ema
52
+ epoches: 500 # 468 + 32
53
+
54
+ ## Our LR-Scheduler
55
+ warmup_iter: 4000
56
+ flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
57
+ no_aug_epoch: 32
58
+ lr_gamma: 0.5
59
+
60
+ optimizer:
61
+ type: AdamW
62
+ params:
63
+ - params: '^(?=.*backbone)(?!.*norm|bn).*$'
64
+ lr: 0.001
65
+ - params: '^(?=.*backbone)(?=.*norm|bn).*$'
66
+ lr: 0.001
67
+ weight_decay: 0.
68
+ - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias
69
+ weight_decay: 0.
70
+
71
+ lr: 0.002
72
+ betas: [0.9, 0.999]
73
+ weight_decay: 0.0001
74
+
75
+ eval_spatial_size: [320, 320]
76
+ train_dataloader:
77
+ total_batch_size: 128
78
+ dataset:
79
+ transforms:
80
+ ops:
81
+ - {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
82
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
83
+ - {type: RandomPhotometricDistort, p: 0.5}
84
+ - {type: RandomZoomOut, fill: 0}
85
+ - {type: RandomIoUCrop, p: 0.8}
86
+ - {type: SanitizeBoundingBoxes, min_size: 12}
87
+ - {type: RandomHorizontalFlip}
88
+ - {type: Resize, size: [320, 320], }
89
+ - {type: SanitizeBoundingBoxes, min_size: 12}
90
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
91
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
92
+ policy:
93
+ epoch: [4, 250, 400] # list
94
+ mosaic_prob: 0.3
95
+
96
+ collate_fn:
97
+ mixup_prob: 0.0
98
+ mixup_epochs: [40000, 15000]
99
+ copyblend_prob: 0.0
100
+ copyblend_epochs: [40000, 15000]
101
+
102
+ stop_epoch: 468 # 468 + 32
103
+ ema_restart_decay: 0.9999
104
+ base_size: 320
105
+ base_size_repeat: ~
106
+
107
+ val_dataloader:
108
+ total_batch_size: 256
109
+ dataset:
110
+ transforms:
111
+ ops:
112
+ - {type: Resize, size: [320, 320], }
113
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
114
+ shuffle: False
115
+ num_workers: 16
116
+
117
+
118
+ DEIMCriterion:
119
+ losses: ['mal', 'boxes'] # , 'local'
120
+ use_uni_set: False
121
+
122
+ matcher:
123
+ matcher_change_epoch: 450 # FIX This
configs/deimv2/deimv2_hgnetv2_femto_coco.yml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_femto_coco
10
+
11
+ DEIM:
12
+ encoder: LiteEncoder
13
+
14
+ HGNetv2:
15
+ name: 'Femto'
16
+ return_idx: [2]
17
+ freeze_at: -1
18
+ freeze_norm: False
19
+ use_lab: True
20
+
21
+ LiteEncoder:
22
+ in_channels: [512]
23
+ feat_strides: [16]
24
+
25
+ # intra
26
+ hidden_dim: 96
27
+
28
+ # cross
29
+ expansion: 0.34
30
+ depth_mult: 0.5
31
+ act: 'silu'
32
+
33
+
34
+ DEIMTransformer:
35
+ feat_channels: [96, 96]
36
+ feat_strides: [16, 32]
37
+ hidden_dim: 96
38
+ num_levels: 2
39
+ num_points: [4, 2]
40
+
41
+ num_layers: 3
42
+ eval_idx: -1
43
+ num_queries: 150
44
+
45
+ # FFN
46
+ dim_feedforward: 256
47
+
48
+ # New options for DEIMv2
49
+ share_bbox_head: True
50
+ use_gateway: False
51
+
52
+ # Increase to search for the optimal ema
53
+ epoches: 500 # 468 + 32
54
+
55
+ ## Our LR-Scheduler
56
+ warmup_iter: 4000
57
+ flat_epoch: 250 # here 500 // 2 (no +4 offset); general rule: 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
58
+ no_aug_epoch: 32
59
+ lr_gamma: 0.5
60
+
61
+ optimizer:
62
+ type: AdamW
63
+ params:
64
+ -
65
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
66
+ lr: 0.0008
67
+ -
68
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
69
+ lr: 0.0008
70
+ weight_decay: 0.
71
+ - # not opt
72
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
73
+ weight_decay: 0.
74
+
75
+ lr: 0.0016
76
+ betas: [0.9, 0.999]
77
+ weight_decay: 0.0001
78
+
79
+ eval_spatial_size: [416, 416]
80
+ train_dataloader:
81
+ total_batch_size: 128
82
+ dataset:
83
+ transforms:
84
+ ops:
85
+ - {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
86
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
87
+ - {type: RandomPhotometricDistort, p: 0.5}
88
+ - {type: RandomZoomOut, fill: 0}
89
+ - {type: RandomIoUCrop, p: 0.8}
90
+ - {type: SanitizeBoundingBoxes, min_size: 10}
91
+ - {type: RandomHorizontalFlip}
92
+ - {type: Resize, size: [416, 416], }
93
+ - {type: SanitizeBoundingBoxes, min_size: 10}
94
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
95
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
96
+ policy:
97
+ epoch: [4, 250, 400] # list
98
+ ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
99
+ mosaic_prob: 0.5
100
+
101
+ collate_fn:
102
+ mixup_prob: 0.0
103
+ mixup_epochs: [40000, 15000]
104
+ copyblend_prob: 0.0
105
+ copyblend_epochs: [40000, 15000]
106
+
107
+ stop_epoch: 468 # epoches (500) = 468 + 32 (no_aug_epoch)
108
+ ema_restart_decay: 0.9999
109
+ base_size: 416
110
+ base_size_repeat: ~
111
+
112
+ val_dataloader:
113
+ total_batch_size: 256
114
+ dataset:
115
+ transforms:
116
+ ops:
117
+ - {type: Resize, size: [416, 416], }
118
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
119
+ shuffle: False
120
+ num_workers: 16
121
+
122
+
123
+ DEIMCriterion:
124
+ losses: ['mal', 'boxes'] # , 'local'
125
+ use_uni_set: False
126
+
127
+ matcher:
128
+ matcher_change_epoch: 450 # FIX This
configs/deimv2/deimv2_hgnetv2_l_coco.yml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml'
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_l_coco
10
+
11
+
12
+ optimizer:
13
+ type: AdamW
14
+ params:
15
+ -
16
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
17
+ lr: 0.000025
18
+ -
19
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
20
+ weight_decay: 0.
21
+
22
+ lr: 0.0005
23
+ betas: [0.9, 0.999]
24
+ weight_decay: 0.000125
configs/deimv2/deimv2_hgnetv2_m_coco.yml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml'
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_m_coco
10
+
11
+ HGNetv2:
12
+ name: 'B2'
13
+ return_idx: [1, 2, 3]
14
+ freeze_at: -1
15
+ freeze_norm: False
16
+ use_lab: True
17
+
18
+ HybridEncoder:
19
+ in_channels: [384, 768, 1536]
20
+ hidden_dim: 256
21
+ depth_mult: 0.67
22
+
23
+ DEIMTransformer:
24
+ num_layers: 4 # 5 6
25
+ eval_idx: -1 # -2 -3
26
+
27
+ optimizer:
28
+ type: AdamW
29
+ params:
30
+ -
31
+ params: '^(?=.*backbone)(?!.*bn).*$'
32
+ lr: 0.00004
33
+ -
34
+ params: '^(?=.*(?:norm|bn)).*$'
35
+ weight_decay: 0.
36
+
37
+ lr: 0.0004
38
+ betas: [0.9, 0.999]
39
+ weight_decay: 0.0001
40
+
41
+ # Increase to search for the optimal ema
42
+ epoches: 102 # 90 (stop_epoch) + 12 (no_aug_epoch)
43
+
44
+ ## Our LR-Scheduler
45
+ flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
46
+ no_aug_epoch: 12
47
+
48
+ ## Our DataAug
49
+ train_dataloader:
50
+ dataset:
51
+ transforms:
52
+ policy:
53
+ epoch: [4, 49, 90] # list
54
+
55
+ collate_fn:
56
+ ema_restart_decay: 0.9999
57
+ base_size_repeat: 6
58
+ mixup_epochs: [4, 49]
59
+ stop_epoch: 90
60
+ copyblend_prob: 0.5
61
+ copyblend_epochs: [4, 90]
62
+ area_threshold: 100
63
+ num_objects: 3
64
+ with_expand: True
65
+ expand_ratios: [0.1, 0.25]
66
+
67
+ DEIMCriterion:
68
+ matcher:
69
+ # new matcher
70
+ change_matcher: True
71
+ iou_order_alpha: 4.0
72
+ matcher_change_epoch: 80
configs/deimv2/deimv2_hgnetv2_n_coco.yml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml'
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_n_coco
10
+
11
+ HGNetv2:
12
+ name: 'B0'
13
+ return_idx: [2, 3]
14
+ freeze_at: -1
15
+ freeze_norm: False
16
+ use_lab: True
17
+
18
+ HybridEncoder:
19
+ in_channels: [512, 1024]
20
+ feat_strides: [16, 32]
21
+
22
+ # intra
23
+ hidden_dim: 128
24
+ use_encoder_idx: [1]
25
+ dim_feedforward: 512
26
+
27
+ # cross
28
+ expansion: 0.34
29
+ depth_mult: 0.5
30
+
31
+ version: 'dfine'
32
+
33
+ DEIMTransformer:
34
+ feat_channels: [128, 128]
35
+ feat_strides: [16, 32]
36
+ hidden_dim: 128
37
+ num_levels: 2
38
+ num_points: [6, 6]
39
+
40
+ num_layers: 3
41
+ eval_idx: -1
42
+
43
+ # FFN
44
+ dim_feedforward: 512
45
+
46
+ optimizer:
47
+ type: AdamW
48
+ params:
49
+ -
50
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
51
+ lr: 0.0004
52
+ -
53
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
54
+ lr: 0.0004
55
+ weight_decay: 0.
56
+ -
57
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
58
+ weight_decay: 0.
59
+
60
+ lr: 0.0008
61
+ betas: [0.9, 0.999]
62
+ weight_decay: 0.0001
63
+
64
+ # Increase to search for the optimal ema
65
+ epoches: 160 # 148 + 12
66
+
67
+ ## Our LR-Scheduler
68
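+ flat_epoch: 7800 # general rule: 4 + epoch // 2, e.g., 40 = 4 + 72 / 2. NOTE(review): the rule gives 78 (= 4 + 148 // 2, the value used in policy.epoch and mixup_epochs below); 7800 exceeds epoches (160) - presumably intentional together with lr_gamma: 1.0, confirm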
69
+ no_aug_epoch: 12
70
+ lr_gamma: 1.0
71
+
72
+ ## Our DataAug
73
+ train_dataloader:
74
+ dataset:
75
+ transforms:
76
+ policy:
77
+ epoch: [4, 78, 148] # list
78
+
79
+ collate_fn:
80
+ ema_restart_decay: 0.9999
81
+ base_size_repeat: ~
82
+ mixup_epochs: [4, 78]
83
+ stop_epoch: 148
84
+ copyblend_prob: 0.4
85
+ copyblend_epochs: [4, 78] # CP half
86
+ area_threshold: 100
87
+ num_objects: 3
88
+ with_expand: True
89
+ expand_ratios: [0.1, 0.25]
90
+
91
+ DEIMCriterion:
92
+ matcher:
93
+ # new matcher
94
+ change_matcher: True
95
+ iou_order_alpha: 4.0
96
+ matcher_change_epoch: 136
configs/deimv2/deimv2_hgnetv2_pico_coco.yml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml',
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_pico_coco
10
+
11
+ DEIM:
12
+ encoder: LiteEncoder
13
+ decoder: DEIMTransformer
14
+
15
+ HGNetv2:
16
+ name: 'Pico'
17
+ return_idx: [2]
18
+ freeze_at: -1
19
+ freeze_norm: False
20
+ use_lab: True
21
+
22
+ LiteEncoder:
23
+ in_channels: [512]
24
+ feat_strides: [16]
25
+
26
+ # intra
27
+ hidden_dim: 112
28
+
29
+ # cross
30
+ expansion: 0.34
31
+ depth_mult: 0.5
32
+ act: 'silu'
33
+
34
+
35
+ DEIMTransformer:
36
+ feat_channels: [112, 112]
37
+ feat_strides: [16, 32]
38
+ hidden_dim: 112
39
+ num_levels: 2
40
+ num_points: [4, 2]
41
+
42
+ num_layers: 3
43
+ eval_idx: -1
44
+ num_queries: 200
45
+
46
+ # FFN
47
+ dim_feedforward: 320
48
+
49
+ # New options for DEIMv2
50
+ share_bbox_head: True
51
+ use_gateway: False
52
+
53
+ # Increase to search for the optimal ema
54
+ epoches: 500 # 468 + 32
55
+
56
+ ## Our LR-Scheduler
57
+ warmup_iter: 4000
58
+ flat_epoch: 250 # here 500 // 2 (no +4 offset); general rule: 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
59
+ no_aug_epoch: 32
60
+ lr_gamma: 0.5
61
+
62
+ optimizer:
63
+ type: AdamW
64
+ params:
65
+ -
66
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
67
+ lr: 0.0008
68
+ -
69
+ params: '^(?=.*backbone)(?=.*norm|bn).*$'
70
+ lr: 0.0008
71
+ weight_decay: 0.
72
+ - # not opt
73
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
74
+ weight_decay: 0.
75
+
76
+ lr: 0.0016
77
+ betas: [0.9, 0.999]
78
+ weight_decay: 0.0001
79
+
80
+ eval_spatial_size: [640, 640]
81
+ train_dataloader:
82
+ total_batch_size: 128
83
+ dataset:
84
+ transforms:
85
+ ops:
86
+ - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
87
+ probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
88
+ - {type: RandomPhotometricDistort, p: 0.5}
89
+ - {type: RandomZoomOut, fill: 0}
90
+ - {type: RandomIoUCrop, p: 0.8}
91
+ - {type: SanitizeBoundingBoxes, min_size: 8}
92
+ - {type: RandomHorizontalFlip}
93
+ - {type: Resize, size: [640, 640], }
94
+ - {type: SanitizeBoundingBoxes, min_size: 8}
95
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
96
+ - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
97
+ policy:
98
+ epoch: [4, 250, 400] # list
99
+ ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
100
+ mosaic_prob: 0.5
101
+
102
+ collate_fn:
103
+ mixup_prob: 0.0
104
+ mixup_epochs: [40000, 15000]
105
+ copyblend_prob: 0.0
106
+ copyblend_epochs: [40000, 15000]
107
+ stop_epoch: 468 # epoches (500) = 468 + 32 (no_aug_epoch)
108
+ ema_restart_decay: 0.9999
109
+ base_size: 640
110
+ base_size_repeat: ~
111
+
112
+ val_dataloader:
113
+ total_batch_size: 256
114
+ dataset:
115
+ transforms:
116
+ ops:
117
+ - {type: Resize, size: [640, 640], }
118
+ - {type: ConvertPILImage, dtype: 'float32', scale: True}
119
+ shuffle: False
120
+ num_workers: 16
121
+
122
+
123
+ DEIMCriterion:
124
+ losses: ['mal', 'boxes'] # , 'local'
125
+ use_uni_set: False
126
+
127
+ matcher:
128
+ matcher_change_epoch: 450 # FIX This
configs/deimv2/deimv2_hgnetv2_s_coco.yml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml'
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_s_coco
10
+
11
+ HGNetv2:
12
+ name: 'B0'
13
+ return_idx: [1, 2, 3]
14
+ freeze_at: -1
15
+ freeze_norm: False
16
+ use_lab: True
17
+
18
+ HybridEncoder:
19
+ in_channels: [256, 512, 1024]
20
+ hidden_dim: 256
21
+ depth_mult: 0.34
22
+ expansion: 0.5
23
+
24
+ version: 'dfine'
25
+
26
+ DEIMTransformer:
27
+ num_layers: 3 # 4 5 6
28
+ eval_idx: -1 # -2 -3 -4
29
+
30
+ optimizer:
31
+ type: AdamW
32
+ params:
33
+ -
34
+ params: '^(?=.*backbone)(?!.*bn).*$'
35
+ lr: 0.0002
36
+ -
37
+ params: '^(?=.*(?:norm|bn)).*$' # except bias
38
+ weight_decay: 0.
39
+
40
+ lr: 0.0004
41
+ betas: [0.9, 0.999]
42
+ weight_decay: 0.0001
43
+
44
+ # Increase to search for the optimal ema
45
+ epoches: 132 # 120 (stop_epoch) + 12 (no_aug_epoch), i.e. 120 + 4n with n = 3
46
+
47
+ ## Our LR-Scheduler
48
+ flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
49
+ no_aug_epoch: 12
50
+
51
+ ## Our DataAug
52
+ train_dataloader:
53
+ dataset:
54
+ transforms:
55
+ policy:
56
+ epoch: [4, 64, 120] # list
57
+
58
+ collate_fn:
59
+ ema_restart_decay: 0.9999
60
+ base_size_repeat: 20
61
+ mixup_epochs: [4, 64]
62
+ stop_epoch: 120
63
+ copyblend_prob: 0.5
64
+ # copyblend_epochs: [4, 64] # from v11 to v12: copy-paste continues only half epochs
65
+ copyblend_epochs: [4, 120]
66
+ area_threshold: 100
67
+ num_objects: 3
68
+ with_expand: True
69
+ expand_ratios: [0.1, 0.25]
70
+
71
+ DEIMCriterion:
72
+ matcher:
73
+ # new matcher
74
+ change_matcher: True
75
+ iou_order_alpha: 4.0
76
+ matcher_change_epoch: 100
configs/deimv2/deimv2_hgnetv2_x_coco.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __include__: [
2
+ '../dataset/coco_detection.yml',
3
+ '../runtime.yml',
4
+ '../base/dataloader.yml',
5
+ '../base/optimizer.yml',
6
+ '../base/deimv2.yml'
7
+ ]
8
+
9
+ output_dir: ./outputs/deimv2_hgnetv2_x_coco
10
+
11
+
12
+ HGNetv2:
13
+ name: 'B5'
14
+ return_idx: [1, 2, 3]
15
+ freeze_stem_only: True
16
+ freeze_at: 0
17
+ freeze_norm: True
18
+
19
+ HybridEncoder:
20
+ # intra
21
+ hidden_dim: 384
22
+ dim_feedforward: 2048
23
+
24
+ DEIMTransformer:
25
+ feat_channels: [384, 384, 384] # [256, 256, 256]
26
+ reg_scale: 8 # 4
27
+
28
+ # FFN
29
+ dim_feedforward: 2048
30
+
31
+ optimizer:
32
+ type: AdamW
33
+ params:
34
+ -
35
+ params: '^(?=.*backbone)(?!.*norm|bn).*$'
36
+ lr: 0.000005
37
+ -
38
+ params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
39
+ weight_decay: 0.
40
+
41
+ lr: 0.0005
42
+ betas: [0.9, 0.999]
43
+ weight_decay: 0.000125
44
+
45
+ # Increase to search for the optimal ema
46
+ epoches: 58 # 50 (last policy epoch) + 8 (no_aug_epoch)
47
+
48
+ ## Our LR-Scheduler
49
+ flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
50
+ no_aug_epoch: 8
51
+
52
+ train_dataloader:
53
+ dataset:
54
+ transforms:
55
+ policy:
56
+ epoch: [4, 29, 50] # list
57
+
58
+ collate_fn:
59
+ ema_restart_decay: 0.9998
60
+ base_size_repeat: 3
configs/runtime.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ print_freq: 500
2
+ output_dir: './logs'
3
+ checkpoint_freq: 12
4
+
5
+
6
+ sync_bn: True
7
+ find_unused_parameters: True
8
+
9
+
10
+ use_amp: False
11
+ scaler:
12
+ type: GradScaler
13
+ enabled: True
14
+
15
+
16
+ use_ema: False
17
+ ema:
18
+ type: ModelEMA
19
+ decay: 0.9999
20
+ warmups: 1000
engine/__init__.py CHANGED
@@ -1,13 +1,16 @@
1
- # engine package
2
- # モジュールをインポートしてレジストリに登録
3
- from . import backbone
4
- from . import deim
5
- from . import data
6
- from . import optim
7
- from . import misc
8
 
9
- # YAMLConfigをエクスポート
10
- from .core.yaml_config import YAMLConfig
 
 
11
 
12
- __all__ = ['YAMLConfig']
13
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
3
+ """
 
 
 
 
4
 
5
+ # for register purpose
6
+ from . import optim
7
+ from . import data
8
+ from . import deim
9
 
10
+ from .backbone import *
11
 
12
+ from .backbone import (
13
+ get_activation,
14
+ FrozenBatchNorm2d,
15
+ freeze_batch_norm2d,
16
+ )
engine/backbone/vit_tiny.py CHANGED
@@ -6,14 +6,16 @@ Modified from DINOv3 (https://github.com/facebookresearch/dinov3)
6
  Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py
7
 
8
  """
 
 
 
 
 
 
9
  import torch
10
  import torch.nn as nn
11
  import torch.nn.functional as F
12
- from functools import partial
13
- import math
14
- import numpy as np
15
- import warnings
16
- from typing import Literal, Tuple
17
 
18
 
19
  class RopePositionEmbedding(nn.Module):
@@ -180,11 +182,11 @@ class Attention(nn.Module):
180
  head_dim = dim // num_heads
181
  self.scale = head_dim ** -0.5
182
  self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
183
- self.attn_drop = nn.Dropout(attn_drop)
184
  self.proj = nn.Linear(dim, dim)
185
  self.proj_drop = nn.Dropout(proj_drop)
186
 
187
- def forward(self, x, rope_sincos=None, register_hook=False):
188
  B, N, C = x.shape
189
  qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
190
  q, k, v = qkv.unbind(0)
@@ -200,13 +202,8 @@ class Attention(nn.Module):
200
  q = torch.cat((q_cls, q_patch), dim=2)
201
  k = torch.cat((k_cls, k_patch), dim=2)
202
 
203
- attn = (q @ k.transpose(-2, -1)) * self.scale
204
- attn = attn.softmax(dim=-1)
205
- attn = self.attn_drop(attn)
206
-
207
- if register_hook: attn.register_hook(self.save_attn_gradients)
208
-
209
- x = (attn @ v).transpose(1, 2).reshape(B, N, C)
210
  x = self.proj(x)
211
  x = self.proj_drop(x)
212
  return x
@@ -220,8 +217,8 @@ class Block(nn.Module):
220
  self.norm2 = norm_layer(dim)
221
  self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
222
 
223
- def forward(self, x, rope_sincos=None, register_hook=False):
224
- attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos, register_hook=register_hook)
225
  x = x + self.drop_path(attn_output)
226
  x = x + self.drop_path(self.mlp(self.norm2(x)))
227
  return x
@@ -260,7 +257,6 @@ class VisionTransformer(nn.Module):
260
  normalize_coords="separate", shift_coords=None, jitter_coords=None,
261
  rescale_coords=None, dtype=None, device=None,
262
  )
263
-
264
  self.init_weights()
265
 
266
  def init_weights(self):
@@ -286,28 +282,7 @@ class VisionTransformer(nn.Module):
286
  def feature_dim(self):
287
  return self.embed_dim
288
 
289
- def forward_features(self, x, register_hook=False):
290
- B, C, H, W = x.shape
291
-
292
- x_embed = self._model.patch_embed(x)
293
- cls_token = self._model.cls_token.expand(x_embed.shape[0], -1, -1)
294
- x = torch.cat((cls_token, x_embed), dim=1)
295
-
296
- patch_grid_h = H // self.patch_size
297
- patch_grid_w = W // self.patch_size
298
- rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
299
-
300
- for blk in self._model.blocks:
301
- x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook)
302
- x = x[:, 1:, :]
303
- return {'features': x.transpose(1, 2).reshape(-1, self.embed_dim, patch_grid_h, patch_grid_w)}
304
-
305
- def forward_pool(self, x):
306
- features = self.forward_features(x)['features']
307
- pooled_features = features.mean(dim=[2, 3])
308
- return {'pooled_features': pooled_features}
309
-
310
- def forward(self, x, register_hook=False):
311
  outs = []
312
  B, C, H, W = x.shape
313
 
@@ -320,7 +295,7 @@ class VisionTransformer(nn.Module):
320
  rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
321
 
322
  for i, blk in enumerate(self._model.blocks):
323
- x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook)
324
  if i in self.return_layers:
325
  outs.append((x[:, 1:], x[:, 0]))
326
  return outs
 
6
  Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py
7
 
8
  """
9
+ import math
10
+ import warnings
11
+ from functools import partial
12
+ from typing import List, Literal, Tuple
13
+
14
+ import numpy as np
15
  import torch
16
  import torch.nn as nn
17
  import torch.nn.functional as F
18
+ from torch import nn
 
 
 
 
19
 
20
 
21
  class RopePositionEmbedding(nn.Module):
 
182
  head_dim = dim // num_heads
183
  self.scale = head_dim ** -0.5
184
  self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
185
+ self.attn_drop = attn_drop
186
  self.proj = nn.Linear(dim, dim)
187
  self.proj_drop = nn.Dropout(proj_drop)
188
 
189
+ def forward(self, x, rope_sincos=None):
190
  B, N, C = x.shape
191
  qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
192
  q, k, v = qkv.unbind(0)
 
202
  q = torch.cat((q_cls, q_patch), dim=2)
203
  k = torch.cat((k_cls, k_patch), dim=2)
204
 
205
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop)
206
+ x = x.transpose(1, 2).reshape([B, N, C])
 
 
 
 
 
207
  x = self.proj(x)
208
  x = self.proj_drop(x)
209
  return x
 
217
  self.norm2 = norm_layer(dim)
218
  self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
219
 
220
+ def forward(self, x, rope_sincos=None):
221
+ attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos)
222
  x = x + self.drop_path(attn_output)
223
  x = x + self.drop_path(self.mlp(self.norm2(x)))
224
  return x
 
257
  normalize_coords="separate", shift_coords=None, jitter_coords=None,
258
  rescale_coords=None, dtype=None, device=None,
259
  )
 
260
  self.init_weights()
261
 
262
  def init_weights(self):
 
282
  def feature_dim(self):
283
  return self.embed_dim
284
 
285
+ def forward(self, x):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  outs = []
287
  B, C, H, W = x.shape
288
 
 
295
  rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
296
 
297
  for i, blk in enumerate(self._model.blocks):
298
+ x = blk(x, rope_sincos=rope_sincos)
299
  if i in self.return_layers:
300
  outs.append((x[:, 1:], x[:, 0]))
301
  return outs