Spaces:
Running
on
Zero
Running
on
Zero
first
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- configs/base/dataloader.yml +39 -0
- configs/base/deim.yml +48 -0
- configs/base/deimv2.yml +144 -0
- configs/base/dfine_hgnetv2.yml +90 -0
- configs/base/optimizer.yml +35 -0
- configs/base/rt_deim.yml +49 -0
- configs/base/rt_optimizer.yml +37 -0
- configs/base/rtdetrv2_r50vd.yml +90 -0
- configs/{coco_detection.yml → dataset/coco_detection.yml} +0 -0
- configs/dataset/crowdhuman_detection.yml +41 -0
- configs/dataset/custom_detection.yml +41 -0
- configs/dataset/obj365_detection.yml +41 -0
- configs/dataset/voc_detection.yml +40 -0
- configs/deim_dfine/deim_hgnetv2_l_coco.yml +37 -0
- configs/deim_dfine/deim_hgnetv2_m_coco.yml +39 -0
- configs/deim_dfine/deim_hgnetv2_n_coco.yml +44 -0
- configs/deim_dfine/deim_hgnetv2_s_coco.yml +39 -0
- configs/deim_dfine/deim_hgnetv2_x_coco.yml +37 -0
- configs/deim_dfine/dfine_hgnetv2_l_coco.yml +44 -0
- configs/deim_dfine/dfine_hgnetv2_m_coco.yml +60 -0
- configs/deim_dfine/dfine_hgnetv2_n_coco.yml +82 -0
- configs/deim_dfine/dfine_hgnetv2_s_coco.yml +61 -0
- configs/deim_dfine/dfine_hgnetv2_x_coco.yml +56 -0
- configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml +50 -0
- configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml +57 -0
- configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml +36 -0
- configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml +32 -0
- configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml +36 -0
- configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml +35 -0
- configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml +39 -0
- configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml +40 -0
- configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml +44 -0
- configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml +57 -0
- configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml +25 -0
- configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml +43 -0
- configs/deimv2/deimv2_dinov3_l_coco.yml +104 -0
- configs/deimv2/deimv2_dinov3_m_coco.yml +107 -0
- configs/{deimv2_floorplan.yaml → deimv2/deimv2_dinov3_s_coco.yml} +31 -112
- configs/deimv2/deimv2_dinov3_x_coco.yml +94 -0
- configs/deimv2/deimv2_hgnetv2_atto_coco.yml +123 -0
- configs/deimv2/deimv2_hgnetv2_femto_coco.yml +128 -0
- configs/deimv2/deimv2_hgnetv2_l_coco.yml +24 -0
- configs/deimv2/deimv2_hgnetv2_m_coco.yml +72 -0
- configs/deimv2/deimv2_hgnetv2_n_coco.yml +96 -0
- configs/deimv2/deimv2_hgnetv2_pico_coco.yml +128 -0
- configs/deimv2/deimv2_hgnetv2_s_coco.yml +76 -0
- configs/deimv2/deimv2_hgnetv2_x_coco.yml +60 -0
- configs/runtime.yml +20 -0
- engine/__init__.py +13 -10
- engine/backbone/vit_tiny.py +15 -40
configs/base/dataloader.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
train_dataloader:
|
| 3 |
+
dataset:
|
| 4 |
+
transforms:
|
| 5 |
+
ops:
|
| 6 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 7 |
+
- {type: RandomZoomOut, fill: 0}
|
| 8 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 9 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 10 |
+
- {type: RandomHorizontalFlip}
|
| 11 |
+
- {type: Resize, size: [640, 640], }
|
| 12 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 13 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 14 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 15 |
+
policy:
|
| 16 |
+
name: stop_epoch
|
| 17 |
+
epoch: 72 # epoch in [72, ~) stop `ops`
|
| 18 |
+
ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] # Mosaic is excluded
|
| 19 |
+
|
| 20 |
+
collate_fn:
|
| 21 |
+
type: BatchImageCollateFunction
|
| 22 |
+
base_size: 640
|
| 23 |
+
base_size_repeat: 3
|
| 24 |
+
stop_epoch: 72 # epoch in [72, ~) stop `multiscales`
|
| 25 |
+
|
| 26 |
+
shuffle: True
|
| 27 |
+
total_batch_size: 32 # total batch size equals to 32 (4 * 8)
|
| 28 |
+
num_workers: 4
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
val_dataloader:
|
| 32 |
+
dataset:
|
| 33 |
+
transforms:
|
| 34 |
+
ops:
|
| 35 |
+
- {type: Resize, size: [640, 640], }
|
| 36 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 37 |
+
shuffle: False
|
| 38 |
+
total_batch_size: 64
|
| 39 |
+
num_workers: 4
|
configs/base/deim.yml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dense O2O
|
| 2 |
+
train_dataloader:
|
| 3 |
+
dataset:
|
| 4 |
+
transforms:
|
| 5 |
+
ops:
|
| 6 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 7 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 8 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 9 |
+
- {type: RandomZoomOut, fill: 0}
|
| 10 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 11 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 12 |
+
- {type: RandomHorizontalFlip}
|
| 13 |
+
- {type: Resize, size: [640, 640], }
|
| 14 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 15 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 16 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 17 |
+
policy:
|
| 18 |
+
epoch: [4, 29, 50] # list
|
| 19 |
+
ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
|
| 20 |
+
mosaic_prob: 0.5
|
| 21 |
+
|
| 22 |
+
collate_fn:
|
| 23 |
+
mixup_prob: 0.5
|
| 24 |
+
mixup_epochs: [4, 29]
|
| 25 |
+
stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
|
| 26 |
+
|
| 27 |
+
# Unfreezing BN
|
| 28 |
+
HGNetv2:
|
| 29 |
+
freeze_at: -1 # 0 default
|
| 30 |
+
freeze_norm: False # True default
|
| 31 |
+
|
| 32 |
+
# Activation
|
| 33 |
+
DFINETransformer:
|
| 34 |
+
activation: silu
|
| 35 |
+
mlp_act: silu
|
| 36 |
+
|
| 37 |
+
## Our LR-Scheduler
|
| 38 |
+
lrsheduler: flatcosine
|
| 39 |
+
lr_gamma: 0.5
|
| 40 |
+
warmup_iter: 2000
|
| 41 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 42 |
+
no_aug_epoch: 8
|
| 43 |
+
|
| 44 |
+
## Our Loss
|
| 45 |
+
DEIMCriterion:
|
| 46 |
+
weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
|
| 47 |
+
losses: ['mal', 'boxes', 'local']
|
| 48 |
+
gamma: 1.5
|
configs/base/deimv2.yml
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
model: DEIM
|
| 4 |
+
criterion: DEIMCriterion
|
| 5 |
+
postprocessor: PostProcessor
|
| 6 |
+
|
| 7 |
+
use_focal_loss: True
|
| 8 |
+
eval_spatial_size: [640, 640] # h w
|
| 9 |
+
checkpoint_freq: 5 # save freq
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
backbone: HGNetv2
|
| 13 |
+
encoder: HybridEncoder
|
| 14 |
+
decoder: DEIMTransformer
|
| 15 |
+
|
| 16 |
+
HGNetv2:
|
| 17 |
+
name: 'B4'
|
| 18 |
+
return_idx: [1, 2, 3]
|
| 19 |
+
freeze_at: -1 # 0 default
|
| 20 |
+
freeze_stem_only: True
|
| 21 |
+
freeze_norm: False # True default
|
| 22 |
+
pretrained: True
|
| 23 |
+
local_model_dir: ./weight/hgnetv2/
|
| 24 |
+
|
| 25 |
+
HybridEncoder:
|
| 26 |
+
in_channels: [512, 1024, 2048]
|
| 27 |
+
feat_strides: [8, 16, 32]
|
| 28 |
+
|
| 29 |
+
# intra
|
| 30 |
+
hidden_dim: 256
|
| 31 |
+
use_encoder_idx: [2]
|
| 32 |
+
num_encoder_layers: 1
|
| 33 |
+
nhead: 8
|
| 34 |
+
dim_feedforward: 1024
|
| 35 |
+
dropout: 0.
|
| 36 |
+
enc_act: 'gelu'
|
| 37 |
+
|
| 38 |
+
# cross
|
| 39 |
+
expansion: 1.0
|
| 40 |
+
depth_mult: 1
|
| 41 |
+
act: 'silu'
|
| 42 |
+
|
| 43 |
+
# New
|
| 44 |
+
version: deim
|
| 45 |
+
csp_type: csp2
|
| 46 |
+
fuse_op: sum
|
| 47 |
+
|
| 48 |
+
DEIMTransformer:
|
| 49 |
+
feat_channels: [256, 256, 256]
|
| 50 |
+
feat_strides: [8, 16, 32]
|
| 51 |
+
hidden_dim: 256
|
| 52 |
+
num_levels: 3
|
| 53 |
+
|
| 54 |
+
num_layers: 6
|
| 55 |
+
eval_idx: -1
|
| 56 |
+
num_queries: 300
|
| 57 |
+
|
| 58 |
+
num_denoising: 100
|
| 59 |
+
label_noise_ratio: 0.5
|
| 60 |
+
box_noise_scale: 1.0
|
| 61 |
+
|
| 62 |
+
reg_max: 32
|
| 63 |
+
reg_scale: 4
|
| 64 |
+
layer_scale: 1 # 2
|
| 65 |
+
|
| 66 |
+
num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
|
| 67 |
+
cross_attn_method: default # default, discrete
|
| 68 |
+
query_select_method: default # default, agnostic
|
| 69 |
+
|
| 70 |
+
# Act
|
| 71 |
+
activation: silu
|
| 72 |
+
mlp_act: silu
|
| 73 |
+
|
| 74 |
+
# FFN
|
| 75 |
+
dim_feedforward: 2048
|
| 76 |
+
|
| 77 |
+
PostProcessor:
|
| 78 |
+
num_top_queries: 300
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
## DEIM LR-Scheduler
|
| 82 |
+
epoches: 58 # 72 + 2n # Increase to search for the optimal ema
|
| 83 |
+
|
| 84 |
+
lrsheduler: flatcosine
|
| 85 |
+
lr_gamma: 0.5
|
| 86 |
+
warmup_iter: 2000
|
| 87 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 88 |
+
no_aug_epoch: 8
|
| 89 |
+
|
| 90 |
+
## Dense O2O: Mosaic + Mixup + CopyBlend
|
| 91 |
+
train_dataloader:
|
| 92 |
+
dataset:
|
| 93 |
+
transforms:
|
| 94 |
+
ops:
|
| 95 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 96 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 97 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 98 |
+
- {type: RandomZoomOut, fill: 0}
|
| 99 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 100 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 101 |
+
- {type: RandomHorizontalFlip}
|
| 102 |
+
- {type: Resize, size: [640, 640], }
|
| 103 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 104 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 105 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 106 |
+
# Mosaic options
|
| 107 |
+
policy:
|
| 108 |
+
epoch: [4, 29, 50] # list
|
| 109 |
+
ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
|
| 110 |
+
mosaic_prob: 0.5
|
| 111 |
+
|
| 112 |
+
collate_fn:
|
| 113 |
+
# Mixup options
|
| 114 |
+
mixup_prob: 0.5
|
| 115 |
+
mixup_epochs: [4, 29]
|
| 116 |
+
stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
|
| 117 |
+
# CopyBlend options
|
| 118 |
+
copyblend_prob: 0.5
|
| 119 |
+
copyblend_epochs: [4, 50]
|
| 120 |
+
area_threshold: 100
|
| 121 |
+
num_objects: 3
|
| 122 |
+
with_expand: True
|
| 123 |
+
expand_ratios: [0.1, 0.25]
|
| 124 |
+
|
| 125 |
+
ema_restart_decay: 0.9999
|
| 126 |
+
base_size_repeat: 4
|
| 127 |
+
|
| 128 |
+
## DEIM Loss
|
| 129 |
+
DEIMCriterion:
|
| 130 |
+
weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
|
| 131 |
+
losses: ['mal', 'boxes', 'local']
|
| 132 |
+
gamma: 1.5
|
| 133 |
+
alpha: 0.75
|
| 134 |
+
reg_max: 32
|
| 135 |
+
|
| 136 |
+
matcher:
|
| 137 |
+
type: HungarianMatcher
|
| 138 |
+
weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
|
| 139 |
+
alpha: 0.25
|
| 140 |
+
gamma: 2.0
|
| 141 |
+
# change matcher
|
| 142 |
+
change_matcher: True
|
| 143 |
+
iou_order_alpha: 4.0
|
| 144 |
+
matcher_change_epoch: 45
|
configs/base/dfine_hgnetv2.yml
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
model: DEIM
|
| 4 |
+
criterion: DEIMCriterion
|
| 5 |
+
postprocessor: PostProcessor
|
| 6 |
+
|
| 7 |
+
use_focal_loss: True
|
| 8 |
+
eval_spatial_size: [640, 640] # h w
|
| 9 |
+
checkpoint_freq: 4 # save freq
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
backbone: HGNetv2
|
| 13 |
+
encoder: HybridEncoder
|
| 14 |
+
decoder: DFINETransformer
|
| 15 |
+
|
| 16 |
+
# Add, default for step lr scheduler
|
| 17 |
+
lrsheduler: flatcosine
|
| 18 |
+
lr_gamma: 1
|
| 19 |
+
warmup_iter: 500
|
| 20 |
+
flat_epoch: 4000000
|
| 21 |
+
no_aug_epoch: 0
|
| 22 |
+
|
| 23 |
+
HGNetv2:
|
| 24 |
+
pretrained: True
|
| 25 |
+
local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/
|
| 26 |
+
|
| 27 |
+
HybridEncoder:
|
| 28 |
+
in_channels: [512, 1024, 2048]
|
| 29 |
+
feat_strides: [8, 16, 32]
|
| 30 |
+
|
| 31 |
+
# intra
|
| 32 |
+
hidden_dim: 256
|
| 33 |
+
use_encoder_idx: [2]
|
| 34 |
+
num_encoder_layers: 1
|
| 35 |
+
nhead: 8
|
| 36 |
+
dim_feedforward: 1024
|
| 37 |
+
dropout: 0.
|
| 38 |
+
enc_act: 'gelu'
|
| 39 |
+
|
| 40 |
+
# cross
|
| 41 |
+
expansion: 1.0
|
| 42 |
+
depth_mult: 1
|
| 43 |
+
act: 'silu'
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
DFINETransformer:
|
| 47 |
+
feat_channels: [256, 256, 256]
|
| 48 |
+
feat_strides: [8, 16, 32]
|
| 49 |
+
hidden_dim: 256
|
| 50 |
+
num_levels: 3
|
| 51 |
+
|
| 52 |
+
num_layers: 6
|
| 53 |
+
eval_idx: -1
|
| 54 |
+
num_queries: 300
|
| 55 |
+
|
| 56 |
+
num_denoising: 100
|
| 57 |
+
label_noise_ratio: 0.5
|
| 58 |
+
box_noise_scale: 1.0
|
| 59 |
+
|
| 60 |
+
# NEW
|
| 61 |
+
reg_max: 32
|
| 62 |
+
reg_scale: 4
|
| 63 |
+
|
| 64 |
+
# Auxiliary decoder layers dimension scaling
|
| 65 |
+
# "eg. If num_layers: 6 eval_idx: -4,
|
| 66 |
+
# then layer 3, 4, 5 are auxiliary decoder layers."
|
| 67 |
+
layer_scale: 1 # 2
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
|
| 71 |
+
cross_attn_method: default # default, discrete
|
| 72 |
+
query_select_method: default # default, agnostic
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
PostProcessor:
|
| 76 |
+
num_top_queries: 300
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
DEIMCriterion:
|
| 80 |
+
weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
|
| 81 |
+
losses: ['vfl', 'boxes', 'local']
|
| 82 |
+
alpha: 0.75
|
| 83 |
+
gamma: 2.0
|
| 84 |
+
reg_max: 32
|
| 85 |
+
|
| 86 |
+
matcher:
|
| 87 |
+
type: HungarianMatcher
|
| 88 |
+
weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
|
| 89 |
+
alpha: 0.25
|
| 90 |
+
gamma: 2.0
|
configs/base/optimizer.yml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
use_amp: True
|
| 2 |
+
use_ema: True
|
| 3 |
+
ema:
|
| 4 |
+
type: ModelEMA
|
| 5 |
+
decay: 0.9999
|
| 6 |
+
warmups: 1000
|
| 7 |
+
start: 0
|
| 8 |
+
|
| 9 |
+
epoches: 72
|
| 10 |
+
clip_max_norm: 0.1
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
optimizer:
|
| 14 |
+
type: AdamW
|
| 15 |
+
params:
|
| 16 |
+
-
|
| 17 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 18 |
+
lr: 0.0000125
|
| 19 |
+
-
|
| 20 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 21 |
+
weight_decay: 0.
|
| 22 |
+
|
| 23 |
+
lr: 0.00025
|
| 24 |
+
betas: [0.9, 0.999]
|
| 25 |
+
weight_decay: 0.000125
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
lr_scheduler:
|
| 29 |
+
type: MultiStepLR
|
| 30 |
+
milestones: [500]
|
| 31 |
+
gamma: 0.1
|
| 32 |
+
|
| 33 |
+
lr_warmup_scheduler:
|
| 34 |
+
type: LinearWarmup
|
| 35 |
+
warmup_duration: 500
|
configs/base/rt_deim.yml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dense O2O
|
| 2 |
+
train_dataloader:
|
| 3 |
+
dataset:
|
| 4 |
+
transforms:
|
| 5 |
+
ops:
|
| 6 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 7 |
+
probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True}
|
| 8 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 9 |
+
- {type: RandomZoomOut, fill: 0}
|
| 10 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 11 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 12 |
+
- {type: RandomHorizontalFlip}
|
| 13 |
+
- {type: Resize, size: [640, 640], }
|
| 14 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 15 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 16 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 17 |
+
policy:
|
| 18 |
+
epoch: [4, 29, 50] # list
|
| 19 |
+
ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
|
| 20 |
+
mosaic_prob: 0.5
|
| 21 |
+
|
| 22 |
+
collate_fn:
|
| 23 |
+
mixup_prob: 0.5
|
| 24 |
+
mixup_epochs: [4, 29]
|
| 25 |
+
stop_epoch: 50 # epoch in [50, ~) stop `multiscales`
|
| 26 |
+
|
| 27 |
+
# Unfreezing BN
|
| 28 |
+
PResNet:
|
| 29 |
+
freeze_at: -1 # default 0
|
| 30 |
+
freeze_norm: False # default True
|
| 31 |
+
|
| 32 |
+
# Activation
|
| 33 |
+
RTDETRTransformerv2:
|
| 34 |
+
query_pos_method: as_reg
|
| 35 |
+
activation: silu
|
| 36 |
+
mlp_act: silu
|
| 37 |
+
|
| 38 |
+
## Our LR-Scheduler
|
| 39 |
+
lrsheduler: flatcosine
|
| 40 |
+
lr_gamma: 0.5
|
| 41 |
+
warmup_iter: 2000
|
| 42 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 43 |
+
no_aug_epoch: 8
|
| 44 |
+
|
| 45 |
+
## Our Loss
|
| 46 |
+
DEIMCriterion:
|
| 47 |
+
weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2}
|
| 48 |
+
losses: ['mal', 'boxes', ]
|
| 49 |
+
gamma: 1.5
|
configs/base/rt_optimizer.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
use_amp: True
|
| 2 |
+
use_ema: True
|
| 3 |
+
ema:
|
| 4 |
+
type: ModelEMA
|
| 5 |
+
decay: 0.9999
|
| 6 |
+
warmups: 2000
|
| 7 |
+
start: 0
|
| 8 |
+
|
| 9 |
+
epoches: 72
|
| 10 |
+
clip_max_norm: 0.1
|
| 11 |
+
|
| 12 |
+
train_dataloader:
|
| 13 |
+
total_batch_size: 16
|
| 14 |
+
|
| 15 |
+
optimizer:
|
| 16 |
+
type: AdamW
|
| 17 |
+
params:
|
| 18 |
+
-
|
| 19 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 20 |
+
lr: 0.00001
|
| 21 |
+
-
|
| 22 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 23 |
+
weight_decay: 0.
|
| 24 |
+
|
| 25 |
+
lr: 0.0001
|
| 26 |
+
betas: [0.9, 0.999]
|
| 27 |
+
weight_decay: 0.0001
|
| 28 |
+
|
| 29 |
+
lr_scheduler:
|
| 30 |
+
type: MultiStepLR
|
| 31 |
+
milestones: [1000]
|
| 32 |
+
gamma: 0.1
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
lr_warmup_scheduler:
|
| 36 |
+
type: LinearWarmup
|
| 37 |
+
warmup_duration: 2000
|
configs/base/rtdetrv2_r50vd.yml
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
model: DEIM
|
| 4 |
+
criterion: DEIMCriterion
|
| 5 |
+
postprocessor: PostProcessor
|
| 6 |
+
|
| 7 |
+
use_focal_loss: True
|
| 8 |
+
eval_spatial_size: [640, 640] # h w
|
| 9 |
+
checkpoint_freq: 4 # save freq
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
backbone: PResNet
|
| 13 |
+
encoder: HybridEncoder
|
| 14 |
+
decoder: RTDETRTransformerv2
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Add, default for step lr scheduler
|
| 18 |
+
lrsheduler: flatcosine
|
| 19 |
+
lr_gamma: 1
|
| 20 |
+
warmup_iter: 2000
|
| 21 |
+
flat_epoch: 4000000
|
| 22 |
+
no_aug_epoch: 0
|
| 23 |
+
|
| 24 |
+
PResNet:
|
| 25 |
+
depth: 50
|
| 26 |
+
variant: d
|
| 27 |
+
freeze_at: 0
|
| 28 |
+
return_idx: [1, 2, 3]
|
| 29 |
+
num_stages: 4
|
| 30 |
+
freeze_norm: True
|
| 31 |
+
pretrained: True
|
| 32 |
+
local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
HybridEncoder:
|
| 36 |
+
in_channels: [512, 1024, 2048]
|
| 37 |
+
feat_strides: [8, 16, 32]
|
| 38 |
+
|
| 39 |
+
# intra
|
| 40 |
+
hidden_dim: 256
|
| 41 |
+
use_encoder_idx: [2]
|
| 42 |
+
num_encoder_layers: 1
|
| 43 |
+
nhead: 8
|
| 44 |
+
dim_feedforward: 1024
|
| 45 |
+
dropout: 0.
|
| 46 |
+
enc_act: 'gelu'
|
| 47 |
+
|
| 48 |
+
# cross
|
| 49 |
+
expansion: 1.0
|
| 50 |
+
depth_mult: 1
|
| 51 |
+
act: 'silu'
|
| 52 |
+
version: rt_detrv2 # pay attention to this
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
RTDETRTransformerv2:
|
| 56 |
+
feat_channels: [256, 256, 256]
|
| 57 |
+
feat_strides: [8, 16, 32]
|
| 58 |
+
hidden_dim: 256
|
| 59 |
+
num_levels: 3
|
| 60 |
+
|
| 61 |
+
num_layers: 6
|
| 62 |
+
num_queries: 300
|
| 63 |
+
|
| 64 |
+
num_denoising: 100
|
| 65 |
+
label_noise_ratio: 0.5
|
| 66 |
+
box_noise_scale: 1.0 # 1.0 0.4
|
| 67 |
+
|
| 68 |
+
eval_idx: -1
|
| 69 |
+
|
| 70 |
+
# NEW, can be chosen
|
| 71 |
+
num_points: [4, 4, 4] # [3,3,3] [2,2,2]
|
| 72 |
+
cross_attn_method: default # default, discrete
|
| 73 |
+
query_select_method: default # default, agnostic
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
PostProcessor:
|
| 77 |
+
num_top_queries: 300
|
| 78 |
+
|
| 79 |
+
DEIMCriterion:
|
| 80 |
+
weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
|
| 81 |
+
losses: ['vfl', 'boxes', ]
|
| 82 |
+
alpha: 0.75
|
| 83 |
+
gamma: 2.0
|
| 84 |
+
use_uni_set: False
|
| 85 |
+
|
| 86 |
+
matcher:
|
| 87 |
+
type: HungarianMatcher
|
| 88 |
+
weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
|
| 89 |
+
alpha: 0.25
|
| 90 |
+
gamma: 2.0
|
configs/{coco_detection.yml → dataset/coco_detection.yml}
RENAMED
|
File without changes
|
configs/dataset/crowdhuman_detection.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
evaluator:
|
| 4 |
+
type: CocoEvaluator
|
| 5 |
+
iou_types: ['bbox', ]
|
| 6 |
+
|
| 7 |
+
num_classes: 2 # your dataset classes
|
| 8 |
+
remap_mscoco_category: False
|
| 9 |
+
|
| 10 |
+
train_dataloader:
|
| 11 |
+
type: DataLoader
|
| 12 |
+
dataset:
|
| 13 |
+
type: CocoDetection
|
| 14 |
+
img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train
|
| 15 |
+
ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json
|
| 16 |
+
return_masks: False
|
| 17 |
+
transforms:
|
| 18 |
+
type: Compose
|
| 19 |
+
ops: ~
|
| 20 |
+
shuffle: True
|
| 21 |
+
num_workers: 4
|
| 22 |
+
drop_last: True
|
| 23 |
+
collate_fn:
|
| 24 |
+
type: BatchImageCollateFunction
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
val_dataloader:
|
| 28 |
+
type: DataLoader
|
| 29 |
+
dataset:
|
| 30 |
+
type: CocoDetection
|
| 31 |
+
img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val
|
| 32 |
+
ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json
|
| 33 |
+
return_masks: False
|
| 34 |
+
transforms:
|
| 35 |
+
type: Compose
|
| 36 |
+
ops: ~
|
| 37 |
+
shuffle: False
|
| 38 |
+
num_workers: 4
|
| 39 |
+
drop_last: False
|
| 40 |
+
collate_fn:
|
| 41 |
+
type: BatchImageCollateFunction
|
configs/dataset/custom_detection.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
evaluator:
|
| 4 |
+
type: CocoEvaluator
|
| 5 |
+
iou_types: ['bbox', ]
|
| 6 |
+
|
| 7 |
+
num_classes: 777 # your dataset classes
|
| 8 |
+
remap_mscoco_category: False
|
| 9 |
+
|
| 10 |
+
train_dataloader:
|
| 11 |
+
type: DataLoader
|
| 12 |
+
dataset:
|
| 13 |
+
type: CocoDetection
|
| 14 |
+
img_folder: /data/yourdataset/train
|
| 15 |
+
ann_file: /data/yourdataset/train/train.json
|
| 16 |
+
return_masks: False
|
| 17 |
+
transforms:
|
| 18 |
+
type: Compose
|
| 19 |
+
ops: ~
|
| 20 |
+
shuffle: True
|
| 21 |
+
num_workers: 4
|
| 22 |
+
drop_last: True
|
| 23 |
+
collate_fn:
|
| 24 |
+
type: BatchImageCollateFunction
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
val_dataloader:
|
| 28 |
+
type: DataLoader
|
| 29 |
+
dataset:
|
| 30 |
+
type: CocoDetection
|
| 31 |
+
img_folder: /data/yourdataset/val
|
| 32 |
+
ann_file: /data/yourdataset/val/val.json
|
| 33 |
+
return_masks: False
|
| 34 |
+
transforms:
|
| 35 |
+
type: Compose
|
| 36 |
+
ops: ~
|
| 37 |
+
shuffle: False
|
| 38 |
+
num_workers: 4
|
| 39 |
+
drop_last: False
|
| 40 |
+
collate_fn:
|
| 41 |
+
type: BatchImageCollateFunction
|
configs/dataset/obj365_detection.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
evaluator:
|
| 4 |
+
type: CocoEvaluator
|
| 5 |
+
iou_types: ['bbox', ]
|
| 6 |
+
|
| 7 |
+
num_classes: 366
|
| 8 |
+
remap_mscoco_category: False
|
| 9 |
+
|
| 10 |
+
train_dataloader:
|
| 11 |
+
type: DataLoader
|
| 12 |
+
dataset:
|
| 13 |
+
type: CocoDetection
|
| 14 |
+
img_folder: /home/Dataset/objects365/train
|
| 15 |
+
ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json
|
| 16 |
+
return_masks: False
|
| 17 |
+
transforms:
|
| 18 |
+
type: Compose
|
| 19 |
+
ops: ~
|
| 20 |
+
shuffle: True
|
| 21 |
+
num_workers: 4
|
| 22 |
+
drop_last: True
|
| 23 |
+
collate_fn:
|
| 24 |
+
type: BatchImageCollateFunction
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
val_dataloader:
|
| 28 |
+
type: DataLoader
|
| 29 |
+
dataset:
|
| 30 |
+
type: CocoDetection
|
| 31 |
+
img_folder: /home/Dataset/objects365/val
|
| 32 |
+
ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json
|
| 33 |
+
return_masks: False
|
| 34 |
+
transforms:
|
| 35 |
+
type: Compose
|
| 36 |
+
ops: ~
|
| 37 |
+
shuffle: False
|
| 38 |
+
num_workers: 4
|
| 39 |
+
drop_last: False
|
| 40 |
+
collate_fn:
|
| 41 |
+
type: BatchImageCollateFunction
|
configs/dataset/voc_detection.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: detection
|
| 2 |
+
|
| 3 |
+
evaluator:
|
| 4 |
+
type: CocoEvaluator
|
| 5 |
+
iou_types: ['bbox', ]
|
| 6 |
+
|
| 7 |
+
num_classes: 20
|
| 8 |
+
|
| 9 |
+
train_dataloader:
|
| 10 |
+
type: DataLoader
|
| 11 |
+
dataset:
|
| 12 |
+
type: VOCDetection
|
| 13 |
+
root: ./dataset/voc/
|
| 14 |
+
ann_file: trainval.txt
|
| 15 |
+
label_file: label_list.txt
|
| 16 |
+
transforms:
|
| 17 |
+
type: Compose
|
| 18 |
+
ops: ~
|
| 19 |
+
shuffle: True
|
| 20 |
+
num_workers: 4
|
| 21 |
+
drop_last: True
|
| 22 |
+
collate_fn:
|
| 23 |
+
type: BatchImageCollateFunction
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
val_dataloader:
|
| 27 |
+
type: DataLoader
|
| 28 |
+
dataset:
|
| 29 |
+
type: VOCDetection
|
| 30 |
+
root: ./dataset/voc/
|
| 31 |
+
ann_file: test.txt
|
| 32 |
+
label_file: label_list.txt
|
| 33 |
+
transforms:
|
| 34 |
+
type: Compose
|
| 35 |
+
ops: ~
|
| 36 |
+
shuffle: False
|
| 37 |
+
num_workers: 4
|
| 38 |
+
drop_last: False
|
| 39 |
+
collate_fn:
|
| 40 |
+
type: BatchImageCollateFunction
|
configs/deim_dfine/deim_hgnetv2_l_coco.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_l_coco.yml',
|
| 3 |
+
'../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_hgnetv2_l_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 13 |
+
lr: 0.000025
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0005
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.000125
|
| 21 |
+
|
| 22 |
+
# Increase to search for the optimal ema
|
| 23 |
+
epoches: 58 # 72 + 2n
|
| 24 |
+
|
| 25 |
+
## Our LR-Scheduler
|
| 26 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 27 |
+
no_aug_epoch: 8
|
| 28 |
+
|
| 29 |
+
train_dataloader:
|
| 30 |
+
dataset:
|
| 31 |
+
transforms:
|
| 32 |
+
policy:
|
| 33 |
+
epoch: [4, 29, 50] # list
|
| 34 |
+
|
| 35 |
+
collate_fn:
|
| 36 |
+
mixup_epochs: [4, 29]
|
| 37 |
+
stop_epoch: 50
|
configs/deim_dfine/deim_hgnetv2_m_coco.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_m_coco.yml',
|
| 3 |
+
'../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_hgnetv2_m_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*bn).*$'
|
| 13 |
+
lr: 0.00004
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0004
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.0001
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Increase to search for the optimal ema
|
| 24 |
+
epoches: 102 # 120 + 4n
|
| 25 |
+
|
| 26 |
+
## Our LR-Scheduler
|
| 27 |
+
flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 28 |
+
no_aug_epoch: 12
|
| 29 |
+
|
| 30 |
+
## Our DataAug
|
| 31 |
+
train_dataloader:
|
| 32 |
+
dataset:
|
| 33 |
+
transforms:
|
| 34 |
+
policy:
|
| 35 |
+
epoch: [4, 49, 90] # list
|
| 36 |
+
|
| 37 |
+
collate_fn:
|
| 38 |
+
mixup_epochs: [4, 49]
|
| 39 |
+
stop_epoch: 90
|
configs/deim_dfine/deim_hgnetv2_n_coco.yml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_n_coco.yml',
|
| 3 |
+
'../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./deim_outputs/deim_hgnetv2_n_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 13 |
+
lr: 0.0004
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 16 |
+
lr: 0.0004
|
| 17 |
+
weight_decay: 0.
|
| 18 |
+
-
|
| 19 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 20 |
+
weight_decay: 0.
|
| 21 |
+
|
| 22 |
+
lr: 0.0008
|
| 23 |
+
betas: [0.9, 0.999]
|
| 24 |
+
weight_decay: 0.0001
|
| 25 |
+
|
| 26 |
+
# Increase to search for the optimal ema
|
| 27 |
+
epoches: 160 # 148 + 12
|
| 28 |
+
|
| 29 |
+
## Our LR-Scheduler
|
| 30 |
+
flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 31 |
+
no_aug_epoch: 12
|
| 32 |
+
lr_gamma: 1.0
|
| 33 |
+
|
| 34 |
+
## Our DataAug
|
| 35 |
+
train_dataloader:
|
| 36 |
+
dataset:
|
| 37 |
+
transforms:
|
| 38 |
+
policy:
|
| 39 |
+
epoch: [4, 78, 148] # list
|
| 40 |
+
|
| 41 |
+
collate_fn:
|
| 42 |
+
mixup_epochs: [4, 78]
|
| 43 |
+
stop_epoch: 148
|
| 44 |
+
base_size_repeat: ~
|
configs/deim_dfine/deim_hgnetv2_s_coco.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_s_coco.yml',
|
| 3 |
+
'../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_hgnetv2_s_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*bn).*$'
|
| 13 |
+
lr: 0.0002
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:norm|bn)).*$' # except bias
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0004
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.0001
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Increase to search for the optimal ema
|
| 24 |
+
epoches: 132 # 120 + 4n
|
| 25 |
+
|
| 26 |
+
## Our LR-Scheduler
|
| 27 |
+
flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 28 |
+
no_aug_epoch: 12
|
| 29 |
+
|
| 30 |
+
## Our DataAug
|
| 31 |
+
train_dataloader:
|
| 32 |
+
dataset:
|
| 33 |
+
transforms:
|
| 34 |
+
policy:
|
| 35 |
+
epoch: [4, 64, 120] # list
|
| 36 |
+
|
| 37 |
+
collate_fn:
|
| 38 |
+
mixup_epochs: [4, 64]
|
| 39 |
+
stop_epoch: 120
|
configs/deim_dfine/deim_hgnetv2_x_coco.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_x_coco.yml',
|
| 3 |
+
'../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_hgnetv2_x_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 13 |
+
lr: 0.000005
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0005
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.000125
|
| 21 |
+
|
| 22 |
+
# Increase to search for the optimal ema
|
| 23 |
+
epoches: 58 # 72 + 2n
|
| 24 |
+
|
| 25 |
+
## Our LR-Scheduler
|
| 26 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 27 |
+
no_aug_epoch: 8
|
| 28 |
+
|
| 29 |
+
train_dataloader:
|
| 30 |
+
dataset:
|
| 31 |
+
transforms:
|
| 32 |
+
policy:
|
| 33 |
+
epoch: [4, 29, 50] # list
|
| 34 |
+
|
| 35 |
+
collate_fn:
|
| 36 |
+
mixup_epochs: [4, 29]
|
| 37 |
+
stop_epoch: 50
|
configs/deim_dfine/dfine_hgnetv2_l_coco.yml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/dfine_hgnetv2_l_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
HGNetv2:
|
| 13 |
+
name: 'B4'
|
| 14 |
+
return_idx: [1, 2, 3]
|
| 15 |
+
freeze_stem_only: True
|
| 16 |
+
freeze_at: 0
|
| 17 |
+
freeze_norm: True
|
| 18 |
+
|
| 19 |
+
optimizer:
|
| 20 |
+
type: AdamW
|
| 21 |
+
params:
|
| 22 |
+
-
|
| 23 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 24 |
+
lr: 0.0000125
|
| 25 |
+
-
|
| 26 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 27 |
+
weight_decay: 0.
|
| 28 |
+
|
| 29 |
+
lr: 0.00025
|
| 30 |
+
betas: [0.9, 0.999]
|
| 31 |
+
weight_decay: 0.000125
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Increase to search for the optimal ema
|
| 35 |
+
epoches: 80 # 72 + 2n
|
| 36 |
+
train_dataloader:
|
| 37 |
+
dataset:
|
| 38 |
+
transforms:
|
| 39 |
+
policy:
|
| 40 |
+
epoch: 72
|
| 41 |
+
collate_fn:
|
| 42 |
+
stop_epoch: 72
|
| 43 |
+
ema_restart_decay: 0.9999
|
| 44 |
+
base_size_repeat: 4
|
configs/deim_dfine/dfine_hgnetv2_m_coco.yml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./output/dfine_hgnetv2_m_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: HGNetv2
|
| 14 |
+
|
| 15 |
+
HGNetv2:
|
| 16 |
+
name: 'B2'
|
| 17 |
+
return_idx: [1, 2, 3]
|
| 18 |
+
freeze_at: -1
|
| 19 |
+
freeze_norm: False
|
| 20 |
+
use_lab: True
|
| 21 |
+
|
| 22 |
+
DFINETransformer:
|
| 23 |
+
num_layers: 4 # 5 6
|
| 24 |
+
eval_idx: -1 # -2 -3
|
| 25 |
+
|
| 26 |
+
HybridEncoder:
|
| 27 |
+
in_channels: [384, 768, 1536]
|
| 28 |
+
hidden_dim: 256
|
| 29 |
+
depth_mult: 0.67
|
| 30 |
+
|
| 31 |
+
optimizer:
|
| 32 |
+
type: AdamW
|
| 33 |
+
params:
|
| 34 |
+
-
|
| 35 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 36 |
+
lr: 0.00002
|
| 37 |
+
-
|
| 38 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 39 |
+
lr: 0.00002
|
| 40 |
+
weight_decay: 0.
|
| 41 |
+
-
|
| 42 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 43 |
+
weight_decay: 0.
|
| 44 |
+
|
| 45 |
+
lr: 0.0002
|
| 46 |
+
betas: [0.9, 0.999]
|
| 47 |
+
weight_decay: 0.0001
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Increase to search for the optimal ema
|
| 51 |
+
epoches: 132 # 120 + 4n
|
| 52 |
+
train_dataloader:
|
| 53 |
+
dataset:
|
| 54 |
+
transforms:
|
| 55 |
+
policy:
|
| 56 |
+
epoch: 120
|
| 57 |
+
collate_fn:
|
| 58 |
+
stop_epoch: 120
|
| 59 |
+
ema_restart_decay: 0.9999
|
| 60 |
+
base_size_repeat: 6
|
configs/deim_dfine/dfine_hgnetv2_n_coco.yml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./output/dfine_hgnetv2_n_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: HGNetv2
|
| 14 |
+
|
| 15 |
+
HGNetv2:
|
| 16 |
+
name: 'B0'
|
| 17 |
+
return_idx: [2, 3]
|
| 18 |
+
freeze_at: -1
|
| 19 |
+
freeze_norm: False
|
| 20 |
+
use_lab: True
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
HybridEncoder:
|
| 24 |
+
in_channels: [512, 1024]
|
| 25 |
+
feat_strides: [16, 32]
|
| 26 |
+
|
| 27 |
+
# intra
|
| 28 |
+
hidden_dim: 128
|
| 29 |
+
use_encoder_idx: [1]
|
| 30 |
+
dim_feedforward: 512
|
| 31 |
+
|
| 32 |
+
# cross
|
| 33 |
+
expansion: 0.34
|
| 34 |
+
depth_mult: 0.5
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
DFINETransformer:
|
| 38 |
+
feat_channels: [128, 128]
|
| 39 |
+
feat_strides: [16, 32]
|
| 40 |
+
hidden_dim: 128
|
| 41 |
+
dim_feedforward: 512
|
| 42 |
+
num_levels: 2
|
| 43 |
+
|
| 44 |
+
num_layers: 3
|
| 45 |
+
eval_idx: -1
|
| 46 |
+
|
| 47 |
+
num_points: [6, 6]
|
| 48 |
+
|
| 49 |
+
optimizer:
|
| 50 |
+
type: AdamW
|
| 51 |
+
params:
|
| 52 |
+
-
|
| 53 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 54 |
+
lr: 0.0004
|
| 55 |
+
-
|
| 56 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 57 |
+
lr: 0.0004
|
| 58 |
+
weight_decay: 0.
|
| 59 |
+
-
|
| 60 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 61 |
+
weight_decay: 0.
|
| 62 |
+
|
| 63 |
+
lr: 0.0008
|
| 64 |
+
betas: [0.9, 0.999]
|
| 65 |
+
weight_decay: 0.0001
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# Increase to search for the optimal ema
|
| 69 |
+
epoches: 160 # 148 + 4n
|
| 70 |
+
train_dataloader:
|
| 71 |
+
total_batch_size: 128
|
| 72 |
+
dataset:
|
| 73 |
+
transforms:
|
| 74 |
+
policy:
|
| 75 |
+
epoch: 148
|
| 76 |
+
collate_fn:
|
| 77 |
+
stop_epoch: 148
|
| 78 |
+
ema_restart_decay: 0.9999
|
| 79 |
+
base_size_repeat: ~
|
| 80 |
+
|
| 81 |
+
val_dataloader:
|
| 82 |
+
total_batch_size: 256
|
configs/deim_dfine/dfine_hgnetv2_s_coco.yml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./output/dfine_hgnetv2_s_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: HGNetv2
|
| 14 |
+
|
| 15 |
+
HGNetv2:
|
| 16 |
+
name: 'B0'
|
| 17 |
+
return_idx: [1, 2, 3]
|
| 18 |
+
freeze_at: -1
|
| 19 |
+
freeze_norm: False
|
| 20 |
+
use_lab: True
|
| 21 |
+
|
| 22 |
+
DFINETransformer:
|
| 23 |
+
num_layers: 3 # 4 5 6
|
| 24 |
+
eval_idx: -1 # -2 -3 -4
|
| 25 |
+
|
| 26 |
+
HybridEncoder:
|
| 27 |
+
in_channels: [256, 512, 1024]
|
| 28 |
+
hidden_dim: 256
|
| 29 |
+
depth_mult: 0.34
|
| 30 |
+
expansion: 0.5
|
| 31 |
+
|
| 32 |
+
optimizer:
|
| 33 |
+
type: AdamW
|
| 34 |
+
params:
|
| 35 |
+
-
|
| 36 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 37 |
+
lr: 0.0001
|
| 38 |
+
-
|
| 39 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 40 |
+
lr: 0.0001
|
| 41 |
+
weight_decay: 0.
|
| 42 |
+
-
|
| 43 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 44 |
+
weight_decay: 0.
|
| 45 |
+
|
| 46 |
+
lr: 0.0002
|
| 47 |
+
betas: [0.9, 0.999]
|
| 48 |
+
weight_decay: 0.0001
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Increase to search for the optimal ema
|
| 52 |
+
epoches: 132 # 120 + 4n
|
| 53 |
+
train_dataloader:
|
| 54 |
+
dataset:
|
| 55 |
+
transforms:
|
| 56 |
+
policy:
|
| 57 |
+
epoch: 120
|
| 58 |
+
collate_fn:
|
| 59 |
+
stop_epoch: 120
|
| 60 |
+
ema_restart_decay: 0.9999
|
| 61 |
+
base_size_repeat: 20
|
configs/deim_dfine/dfine_hgnetv2_x_coco.yml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./output/dfine_hgnetv2_x_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: HGNetv2
|
| 14 |
+
|
| 15 |
+
HGNetv2:
|
| 16 |
+
name: 'B5'
|
| 17 |
+
return_idx: [1, 2, 3]
|
| 18 |
+
freeze_stem_only: True
|
| 19 |
+
freeze_at: 0
|
| 20 |
+
freeze_norm: True
|
| 21 |
+
|
| 22 |
+
HybridEncoder:
|
| 23 |
+
# intra
|
| 24 |
+
hidden_dim: 384
|
| 25 |
+
dim_feedforward: 2048
|
| 26 |
+
|
| 27 |
+
DFINETransformer:
|
| 28 |
+
feat_channels: [384, 384, 384]
|
| 29 |
+
reg_scale: 8
|
| 30 |
+
|
| 31 |
+
optimizer:
|
| 32 |
+
type: AdamW
|
| 33 |
+
params:
|
| 34 |
+
-
|
| 35 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 36 |
+
lr: 0.0000025
|
| 37 |
+
-
|
| 38 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 39 |
+
weight_decay: 0.
|
| 40 |
+
|
| 41 |
+
lr: 0.00025
|
| 42 |
+
betas: [0.9, 0.999]
|
| 43 |
+
weight_decay: 0.000125
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Increase to search for the optimal ema
|
| 47 |
+
epoches: 80 # 72 + 2n
|
| 48 |
+
train_dataloader:
|
| 49 |
+
dataset:
|
| 50 |
+
transforms:
|
| 51 |
+
policy:
|
| 52 |
+
epoch: 72
|
| 53 |
+
collate_fn:
|
| 54 |
+
stop_epoch: 72
|
| 55 |
+
ema_restart_decay: 0.9998
|
| 56 |
+
base_size_repeat: 3
|
configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./dfine_hgnetv2_x_obj2coco.yml',
|
| 3 |
+
'../../base/deim.yml'
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e
|
| 7 |
+
|
| 8 |
+
HGNetv2:
|
| 9 |
+
freeze_at: 0 # 0 default
|
| 10 |
+
freeze_norm: True # True default
|
| 11 |
+
|
| 12 |
+
# Activation
|
| 13 |
+
DFINETransformer:
|
| 14 |
+
activation: relu
|
| 15 |
+
mlp_act: relu
|
| 16 |
+
|
| 17 |
+
optimizer:
|
| 18 |
+
type: AdamW
|
| 19 |
+
params:
|
| 20 |
+
-
|
| 21 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 22 |
+
lr: 0.0000025
|
| 23 |
+
-
|
| 24 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 25 |
+
weight_decay: 0.
|
| 26 |
+
|
| 27 |
+
lr: 0.00025
|
| 28 |
+
betas: [0.9, 0.999]
|
| 29 |
+
weight_decay: 0.000125
|
| 30 |
+
|
| 31 |
+
# Increase to search for the optimal ema
|
| 32 |
+
epoches: 24 # 72 + 2n
|
| 33 |
+
|
| 34 |
+
## Our LR-Scheduler
|
| 35 |
+
lrsheduler: flatcosine
|
| 36 |
+
lr_gamma: 1
|
| 37 |
+
warmup_iter: 0 # 0
|
| 38 |
+
flat_epoch: 12000 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 39 |
+
no_aug_epoch: 4
|
| 40 |
+
|
| 41 |
+
## Our DataAug
|
| 42 |
+
train_dataloader:
|
| 43 |
+
dataset:
|
| 44 |
+
transforms:
|
| 45 |
+
policy:
|
| 46 |
+
epoch: [2, 12, 20] # list
|
| 47 |
+
|
| 48 |
+
collate_fn:
|
| 49 |
+
mixup_epochs: [2, 12]
|
| 50 |
+
stop_epoch: 20
|
configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../../dataset/coco_detection.yml',
|
| 3 |
+
'../../runtime.yml',
|
| 4 |
+
'../../base/dataloader.yml',
|
| 5 |
+
'../../base/optimizer.yml',
|
| 6 |
+
'../../base/dfine_hgnetv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/dfine_hgnetv2_x_obj2coco
|
| 10 |
+
|
| 11 |
+
HGNetv2:
|
| 12 |
+
name: 'B5'
|
| 13 |
+
return_idx: [1, 2, 3]
|
| 14 |
+
freeze_stem_only: True
|
| 15 |
+
freeze_at: 0
|
| 16 |
+
freeze_norm: True
|
| 17 |
+
|
| 18 |
+
HybridEncoder:
|
| 19 |
+
# intra
|
| 20 |
+
hidden_dim: 384
|
| 21 |
+
dim_feedforward: 2048
|
| 22 |
+
|
| 23 |
+
DFINETransformer:
|
| 24 |
+
feat_channels: [384, 384, 384]
|
| 25 |
+
reg_scale: 8
|
| 26 |
+
|
| 27 |
+
optimizer:
|
| 28 |
+
type: AdamW
|
| 29 |
+
params:
|
| 30 |
+
-
|
| 31 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 32 |
+
lr: 0.0000025
|
| 33 |
+
-
|
| 34 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 35 |
+
weight_decay: 0.
|
| 36 |
+
|
| 37 |
+
lr: 0.00025
|
| 38 |
+
betas: [0.9, 0.999]
|
| 39 |
+
weight_decay: 0.000125
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
epoches: 36 # Early stop
|
| 43 |
+
train_dataloader:
|
| 44 |
+
dataset:
|
| 45 |
+
transforms:
|
| 46 |
+
policy:
|
| 47 |
+
epoch: 30
|
| 48 |
+
collate_fn:
|
| 49 |
+
stop_epoch: 30
|
| 50 |
+
ema_restart_decay: 0.9999
|
| 51 |
+
base_size_repeat: 3
|
| 52 |
+
|
| 53 |
+
ema:
|
| 54 |
+
warmups: 0
|
| 55 |
+
|
| 56 |
+
lr_warmup_scheduler:
|
| 57 |
+
warmup_duration: 0
|
configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./rtdetrv2_r101vd_6x_coco.yml',
|
| 3 |
+
'../base/rt_deim.yml',
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 13 |
+
lr: 0.000002
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0002
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.0001
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# change part
|
| 24 |
+
epoches: 60
|
| 25 |
+
flat_epoch: 34 # 4 + 60 / 2
|
| 26 |
+
no_aug_epoch: 2
|
| 27 |
+
|
| 28 |
+
train_dataloader:
|
| 29 |
+
dataset:
|
| 30 |
+
transforms:
|
| 31 |
+
policy:
|
| 32 |
+
epoch: [4, 34, 58] # list
|
| 33 |
+
|
| 34 |
+
collate_fn:
|
| 35 |
+
mixup_epochs: [4, 34]
|
| 36 |
+
stop_epoch: 58
|
configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./rtdetrv2_r18vd_120e_coco.yml',
|
| 3 |
+
'../base/rt_deim.yml',
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 13 |
+
weight_decay: 0.
|
| 14 |
+
|
| 15 |
+
lr: 0.0002
|
| 16 |
+
betas: [0.9, 0.999]
|
| 17 |
+
weight_decay: 0.0001
|
| 18 |
+
|
| 19 |
+
# change part
|
| 20 |
+
epoches: 120
|
| 21 |
+
flat_epoch: 64 # 4 + 120 / 2
|
| 22 |
+
no_aug_epoch: 3
|
| 23 |
+
|
| 24 |
+
train_dataloader:
|
| 25 |
+
dataset:
|
| 26 |
+
transforms:
|
| 27 |
+
policy:
|
| 28 |
+
epoch: [4, 64, 117] # list
|
| 29 |
+
|
| 30 |
+
collate_fn:
|
| 31 |
+
mixup_epochs: [4, 64]
|
| 32 |
+
stop_epoch: 117
|
configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./rtdetrv2_r34vd_120e_coco.yml',
|
| 3 |
+
'../base/rt_deim.yml',
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 13 |
+
lr: 0.0001
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0002
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.0001
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# change part
|
| 24 |
+
epoches: 120
|
| 25 |
+
flat_epoch: 64
|
| 26 |
+
no_aug_epoch: 3
|
| 27 |
+
|
| 28 |
+
train_dataloader:
|
| 29 |
+
dataset:
|
| 30 |
+
transforms:
|
| 31 |
+
policy:
|
| 32 |
+
epoch: [4, 64, 117] # list
|
| 33 |
+
|
| 34 |
+
collate_fn:
|
| 35 |
+
mixup_epochs: [4, 64]
|
| 36 |
+
stop_epoch: 117
|
configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./rtdetrv2_r50vd_6x_coco.yml',
|
| 3 |
+
'../base/rt_deim.yml',
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco
|
| 7 |
+
|
| 8 |
+
optimizer:
|
| 9 |
+
type: AdamW
|
| 10 |
+
params:
|
| 11 |
+
-
|
| 12 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 13 |
+
lr: 0.00002
|
| 14 |
+
-
|
| 15 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 16 |
+
weight_decay: 0.
|
| 17 |
+
|
| 18 |
+
lr: 0.0002
|
| 19 |
+
betas: [0.9, 0.999]
|
| 20 |
+
weight_decay: 0.0001
|
| 21 |
+
|
| 22 |
+
# change part
|
| 23 |
+
epoches: 60
|
| 24 |
+
flat_epoch: 34 # 4 + 60 / 2
|
| 25 |
+
no_aug_epoch: 2
|
| 26 |
+
|
| 27 |
+
train_dataloader:
|
| 28 |
+
dataset:
|
| 29 |
+
transforms:
|
| 30 |
+
policy:
|
| 31 |
+
epoch: [4, 34, 58] # list
|
| 32 |
+
|
| 33 |
+
collate_fn:
|
| 34 |
+
mixup_epochs: [4, 34]
|
| 35 |
+
stop_epoch: 58
|
configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'./rtdetrv2_r50vd_m_7x_coco.yml',
|
| 3 |
+
'../base/rt_deim.yml',
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco
|
| 7 |
+
|
| 8 |
+
RTDETRTransformerv2:
|
| 9 |
+
eval_idx: 2 # use the 3rd decoder layer (zero-based index 2) for eval
|
| 10 |
+
num_layers: 3
|
| 11 |
+
|
| 12 |
+
optimizer:
|
| 13 |
+
type: AdamW
|
| 14 |
+
params:
|
| 15 |
+
-
|
| 16 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 17 |
+
lr: 0.00002
|
| 18 |
+
-
|
| 19 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 20 |
+
weight_decay: 0.
|
| 21 |
+
|
| 22 |
+
lr: 0.0002
|
| 23 |
+
betas: [0.9, 0.999]
|
| 24 |
+
weight_decay: 0.0001
|
| 25 |
+
|
| 26 |
+
# change part
|
| 27 |
+
epoches: 60
|
| 28 |
+
flat_epoch: 34 # 4 + 60 / 2
|
| 29 |
+
no_aug_epoch: 2
|
| 30 |
+
|
| 31 |
+
train_dataloader:
|
| 32 |
+
dataset:
|
| 33 |
+
transforms:
|
| 34 |
+
policy:
|
| 35 |
+
epoch: [4, 34, 58] # list
|
| 36 |
+
|
| 37 |
+
collate_fn:
|
| 38 |
+
mixup_epochs: [4, 34]
|
| 39 |
+
stop_epoch: 58
|
configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/rt_optimizer.yml',
|
| 6 |
+
'../base/rtdetrv2_r50vd.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./outputs/rtdetrv2_r101vd_6x_coco
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
PResNet:
|
| 14 |
+
depth: 101
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
HybridEncoder:
|
| 18 |
+
# intra
|
| 19 |
+
hidden_dim: 384
|
| 20 |
+
dim_feedforward: 2048
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
RTDETRTransformerv2:
|
| 24 |
+
feat_channels: [384, 384, 384]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
optimizer:
|
| 28 |
+
type: AdamW
|
| 29 |
+
params:
|
| 30 |
+
-
|
| 31 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 32 |
+
lr: 0.000001
|
| 33 |
+
-
|
| 34 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm
|
| 35 |
+
weight_decay: 0.
|
| 36 |
+
|
| 37 |
+
lr: 0.0001
|
| 38 |
+
betas: [0.9, 0.999]
|
| 39 |
+
weight_decay: 0.0001
|
| 40 |
+
|
configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/rt_optimizer.yml',
|
| 6 |
+
'../base/rtdetrv2_r50vd.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./output/rtdetrv2_r18vd_120e_coco
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
PResNet:
|
| 14 |
+
depth: 18
|
| 15 |
+
freeze_at: -1
|
| 16 |
+
freeze_norm: False
|
| 17 |
+
pretrained: True
|
| 18 |
+
|
| 19 |
+
HybridEncoder:
|
| 20 |
+
in_channels: [128, 256, 512]
|
| 21 |
+
hidden_dim: 256
|
| 22 |
+
expansion: 0.5
|
| 23 |
+
|
| 24 |
+
RTDETRTransformerv2:
|
| 25 |
+
num_layers: 3
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
epoches: 120
|
| 29 |
+
|
| 30 |
+
optimizer:
|
| 31 |
+
type: AdamW
|
| 32 |
+
params:
|
| 33 |
+
-
|
| 34 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 35 |
+
weight_decay: 0.
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
train_dataloader:
|
| 39 |
+
dataset:
|
| 40 |
+
transforms:
|
| 41 |
+
policy:
|
| 42 |
+
epoch: 117
|
| 43 |
+
collate_fn:
|
| 44 |
+
scales: ~
|
configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/rt_optimizer.yml',
|
| 6 |
+
'../base/rtdetrv2_r50vd.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./outputs/rtdetrv2_r34vd_120e_coco
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
PResNet:
|
| 14 |
+
depth: 34
|
| 15 |
+
freeze_at: -1
|
| 16 |
+
freeze_norm: False
|
| 17 |
+
pretrained: True
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
HybridEncoder:
|
| 21 |
+
in_channels: [128, 256, 512]
|
| 22 |
+
hidden_dim: 256
|
| 23 |
+
expansion: 0.5
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
RTDETRTransformerv2:
|
| 27 |
+
num_layers: 4
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
epoches: 120
|
| 31 |
+
|
| 32 |
+
optimizer:
|
| 33 |
+
type: AdamW
|
| 34 |
+
params:
|
| 35 |
+
-
|
| 36 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 37 |
+
lr: 0.00005
|
| 38 |
+
-
|
| 39 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 40 |
+
lr: 0.00005
|
| 41 |
+
weight_decay: 0.
|
| 42 |
+
-
|
| 43 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 44 |
+
weight_decay: 0.
|
| 45 |
+
|
| 46 |
+
lr: 0.0001
|
| 47 |
+
betas: [0.9, 0.999]
|
| 48 |
+
weight_decay: 0.0001
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
train_dataloader:
|
| 52 |
+
dataset:
|
| 53 |
+
transforms:
|
| 54 |
+
policy:
|
| 55 |
+
epoch: 117
|
| 56 |
+
collate_fn:
|
| 57 |
+
stop_epoch: 117
|
configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/rt_optimizer.yml',
|
| 6 |
+
'../base/rtdetrv2_r50vd.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./outputs/rtdetrv2_r50vd_6x_coco
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
optimizer:
|
| 14 |
+
type: AdamW
|
| 15 |
+
params:
|
| 16 |
+
-
|
| 17 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 18 |
+
lr: 0.00001
|
| 19 |
+
-
|
| 20 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 21 |
+
weight_decay: 0.
|
| 22 |
+
|
| 23 |
+
lr: 0.0001
|
| 24 |
+
betas: [0.9, 0.999]
|
| 25 |
+
weight_decay: 0.0001
|
configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/rt_optimizer.yml',
|
| 6 |
+
'../base/rtdetrv2_r50vd.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
HybridEncoder:
|
| 13 |
+
expansion: 0.5
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
RTDETRTransformerv2:
|
| 17 |
+
eval_idx: 2 # use the 3rd decoder layer (zero-based index 2) for eval
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
epoches: 84
|
| 21 |
+
|
| 22 |
+
optimizer:
|
| 23 |
+
type: AdamW
|
| 24 |
+
params:
|
| 25 |
+
-
|
| 26 |
+
params: '^(?=.*backbone)(?!.*norm).*$'
|
| 27 |
+
lr: 0.00001
|
| 28 |
+
-
|
| 29 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 30 |
+
weight_decay: 0.
|
| 31 |
+
|
| 32 |
+
lr: 0.0001
|
| 33 |
+
betas: [0.9, 0.999]
|
| 34 |
+
weight_decay: 0.0001
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
train_dataloader:
|
| 38 |
+
dataset:
|
| 39 |
+
transforms:
|
| 40 |
+
policy:
|
| 41 |
+
epoch: 81
|
| 42 |
+
collate_fn:
|
| 43 |
+
stop_epoch: 81
|
configs/deimv2/deimv2_dinov3_l_coco.yml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./outputs/deimv2_dinov3_l_coco
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: DINOv3STAs
|
| 14 |
+
|
| 15 |
+
DINOv3STAs:
|
| 16 |
+
name: dinov3_vits16
|
| 17 |
+
weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth
|
| 18 |
+
interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
|
| 19 |
+
finetune: True
|
| 20 |
+
conv_inplane: 32
|
| 21 |
+
hidden_dim: 224
|
| 22 |
+
|
| 23 |
+
HybridEncoder:
|
| 24 |
+
in_channels: [224, 224, 224]
|
| 25 |
+
hidden_dim: 224
|
| 26 |
+
dim_feedforward: 896
|
| 27 |
+
|
| 28 |
+
DEIMTransformer:
|
| 29 |
+
feat_channels: [224, 224, 224]
|
| 30 |
+
hidden_dim: 224
|
| 31 |
+
num_layers: 4
|
| 32 |
+
eval_idx: -1
|
| 33 |
+
dim_feedforward: 1792
|
| 34 |
+
|
| 35 |
+
## DEIM LR-Scheduler
|
| 36 |
+
epoches: 68 # 72 + 2n # Increase to search for the optimal ema
|
| 37 |
+
|
| 38 |
+
lrsheduler: flatcosine
|
| 39 |
+
lr_gamma: 0.5
|
| 40 |
+
warmup_iter: 2000
|
| 41 |
+
flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 42 |
+
no_aug_epoch: 8
|
| 43 |
+
|
| 44 |
+
## Optimizer
|
| 45 |
+
optimizer:
|
| 46 |
+
type: AdamW
|
| 47 |
+
params:
|
| 48 |
+
-
|
| 49 |
+
# except norm/bn/bias in self.dinov3
|
| 50 |
+
params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
|
| 51 |
+
lr: 0.0000125
|
| 52 |
+
-
|
| 53 |
+
# including norm/bn/bias in self.dinov3
|
| 54 |
+
params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
|
| 55 |
+
lr: 0.0000125
|
| 56 |
+
weight_decay: 0.
|
| 57 |
+
-
|
| 58 |
+
# including norm/bn/bias except for the self.dinov3
|
| 59 |
+
params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 60 |
+
weight_decay: 0.
|
| 61 |
+
|
| 62 |
+
lr: 0.0005
|
| 63 |
+
betas: [0.9, 0.999]
|
| 64 |
+
weight_decay: 0.000125
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
## Dense O2O: Mosaic + Mixup + CopyBlend
|
| 68 |
+
train_dataloader:
|
| 69 |
+
dataset:
|
| 70 |
+
transforms:
|
| 71 |
+
ops:
|
| 72 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 73 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 74 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 75 |
+
- {type: RandomZoomOut, fill: 0}
|
| 76 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 77 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 78 |
+
- {type: RandomHorizontalFlip}
|
| 79 |
+
- {type: Resize, size: [640, 640], }
|
| 80 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 81 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 82 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 83 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 84 |
+
policy:
|
| 85 |
+
epoch: [4, 34, 60] # list
|
| 86 |
+
|
| 87 |
+
collate_fn:
|
| 88 |
+
mixup_epochs: [4, 34]
|
| 89 |
+
stop_epoch: 60
|
| 90 |
+
copyblend_epochs: [4, 60]
|
| 91 |
+
base_size_repeat: 3
|
| 92 |
+
|
| 93 |
+
val_dataloader:
|
| 94 |
+
dataset:
|
| 95 |
+
transforms:
|
| 96 |
+
ops:
|
| 97 |
+
- {type: Resize, size: [640, 640], }
|
| 98 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 99 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 100 |
+
|
| 101 |
+
## DEIM Loss
|
| 102 |
+
DEIMCriterion:
|
| 103 |
+
matcher:
|
| 104 |
+
matcher_change_epoch: 50
|
configs/deimv2/deimv2_dinov3_m_coco.yml
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_dinov3_m_coco
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
backbone: DINOv3STAs
|
| 13 |
+
|
| 14 |
+
DINOv3STAs:
|
| 15 |
+
name: vit_tinyplus
|
| 16 |
+
embed_dim: 256
|
| 17 |
+
weights_path: ./ckpts/vittplus_distill.pt
|
| 18 |
+
interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
|
| 19 |
+
num_heads: 4
|
| 20 |
+
|
| 21 |
+
HybridEncoder:
|
| 22 |
+
in_channels: [256, 256, 256]
|
| 23 |
+
depth_mult: 1
|
| 24 |
+
expansion: 0.67
|
| 25 |
+
hidden_dim: 256
|
| 26 |
+
dim_feedforward: 512
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
DEIMTransformer:
|
| 30 |
+
feat_channels: [256, 256, 256]
|
| 31 |
+
hidden_dim: 256
|
| 32 |
+
dim_feedforward: 512
|
| 33 |
+
num_layers: 4 # 4 5 6
|
| 34 |
+
eval_idx: -1 # -2 -3 -4
|
| 35 |
+
|
| 36 |
+
optimizer:
|
| 37 |
+
type: AdamW
|
| 38 |
+
|
| 39 |
+
params:
|
| 40 |
+
-
|
| 41 |
+
# self.dinov3 parameters, excluding norm/bn/bias
|
| 42 |
+
params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
|
| 43 |
+
lr: 0.000025
|
| 44 |
+
-
|
| 45 |
+
# only the norm/bn/bias parameters inside self.dinov3
|
| 46 |
+
params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
|
| 47 |
+
lr: 0.000025
|
| 48 |
+
weight_decay: 0.
|
| 49 |
+
-
|
| 50 |
+
# norm/bn/bias parameters outside self.dinov3 (sta, encoder, decoder)
|
| 51 |
+
params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 52 |
+
weight_decay: 0.
|
| 53 |
+
|
| 54 |
+
lr: 0.0005
|
| 55 |
+
betas: [0.9, 0.999]
|
| 56 |
+
weight_decay: 0.0001
|
| 57 |
+
|
| 58 |
+
epoches: 102 # 90 + 12 (stop_epoch + no_aug_epoch)
|
| 59 |
+
|
| 60 |
+
## Our LR-Scheduler
|
| 61 |
+
flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 62 |
+
no_aug_epoch: 12
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
## Our DataAug
|
| 66 |
+
train_dataloader:
|
| 67 |
+
dataset:
|
| 68 |
+
transforms:
|
| 69 |
+
ops:
|
| 70 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 71 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 72 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 73 |
+
- {type: RandomZoomOut, fill: 0}
|
| 74 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 75 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 76 |
+
- {type: RandomHorizontalFlip}
|
| 77 |
+
- {type: Resize, size: [640, 640], }
|
| 78 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 79 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 80 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 81 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 82 |
+
policy:
|
| 83 |
+
epoch: [4, 49, 90] # list
|
| 84 |
+
|
| 85 |
+
collate_fn:
|
| 86 |
+
mixup_prob: 0.5
|
| 87 |
+
ema_restart_decay: 0.9999
|
| 88 |
+
base_size_repeat: 6
|
| 89 |
+
mixup_epochs: [4, 49]
|
| 90 |
+
stop_epoch: 90
|
| 91 |
+
copyblend_epochs: [4, 90]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
val_dataloader:
|
| 95 |
+
dataset:
|
| 96 |
+
transforms:
|
| 97 |
+
ops:
|
| 98 |
+
- {type: Resize, size: [640, 640], }
|
| 99 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 100 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 101 |
+
|
| 102 |
+
DEIMCriterion:
|
| 103 |
+
matcher:
|
| 104 |
+
# new matcher
|
| 105 |
+
change_matcher: True
|
| 106 |
+
iou_order_alpha: 4.0
|
| 107 |
+
matcher_change_epoch: 80
|
configs/{deimv2_floorplan.yaml → deimv2/deimv2_dinov3_s_coco.yml}
RENAMED
|
@@ -1,58 +1,21 @@
|
|
| 1 |
__include__: [
|
| 2 |
-
'coco_detection.yml',
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
]
|
| 8 |
|
| 9 |
-
output_dir: ./outputs/
|
| 10 |
|
| 11 |
-
# モデル定義(engine/core.pyが参照する)
|
| 12 |
-
model:
|
| 13 |
-
type: DEIM
|
| 14 |
-
backbone:
|
| 15 |
-
type: DINOv3STAs
|
| 16 |
-
name: vit_tiny
|
| 17 |
-
weights_path: ./ckpts/vitt_distill.pt
|
| 18 |
-
interaction_indexes: [3, 7, 11]
|
| 19 |
-
num_heads: 3
|
| 20 |
-
embed_dim: 192
|
| 21 |
-
encoder:
|
| 22 |
-
type: HybridEncoder
|
| 23 |
-
in_channels: [192, 192, 192]
|
| 24 |
-
depth_mult: 0.67
|
| 25 |
-
expansion: 0.34
|
| 26 |
-
hidden_dim: 192
|
| 27 |
-
dim_feedforward: 512
|
| 28 |
-
decoder:
|
| 29 |
-
type: DEIMTransformer
|
| 30 |
-
feat_channels: [192, 192, 192]
|
| 31 |
-
hidden_dim: 192
|
| 32 |
-
dim_feedforward: 512
|
| 33 |
-
num_layers: 4 # 4 5 6
|
| 34 |
-
eval_idx: -1 # -2 -3 -4
|
| 35 |
-
|
| 36 |
-
# ポストプロセッサ定義(engine/core.pyが参照する)
|
| 37 |
-
postprocessor:
|
| 38 |
-
type: PostProcessor
|
| 39 |
-
|
| 40 |
-
# 互換性のため残す(必要に応じて)
|
| 41 |
DEIM:
|
| 42 |
backbone: DINOv3STAs
|
| 43 |
|
| 44 |
-
Model:
|
| 45 |
-
num_classes: 16
|
| 46 |
-
class_names: ["kanki", "kanki_shikaku", "kanki_regisuta", "window1", "window2", "door1", "door2", "bathtub1", "konro1", "sink1", "toilet1", "kasaikeihou1", "kasaikeihou2", "houi1", "houi2", "houi3"]
|
| 47 |
-
|
| 48 |
-
# eval_spatial_sizeを明示的に設定(推論時の画像サイズ)
|
| 49 |
-
eval_spatial_size: [640, 640]
|
| 50 |
-
|
| 51 |
DINOv3STAs:
|
| 52 |
name: vit_tiny
|
| 53 |
embed_dim: 192
|
| 54 |
-
weights_path: ./ckpts/vitt_distill.pt
|
| 55 |
-
interaction_indexes: [3, 7, 11]
|
| 56 |
num_heads: 3
|
| 57 |
|
| 58 |
HybridEncoder:
|
|
@@ -93,97 +56,53 @@ optimizer:
|
|
| 93 |
betas: [0.9, 0.999]
|
| 94 |
weight_decay: 0.0001
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
no_aug_epoch: 46
|
| 99 |
-
|
| 100 |
-
# optimizer.ymlから必要な設定を手動で追加
|
| 101 |
-
use_amp: True
|
| 102 |
-
use_ema: True
|
| 103 |
-
ema:
|
| 104 |
-
type: ModelEMA
|
| 105 |
-
decay: 0.9999
|
| 106 |
-
warmups: 1000
|
| 107 |
-
start: 0
|
| 108 |
-
|
| 109 |
-
clip_max_norm: 0.1
|
| 110 |
-
sync_bn: True
|
| 111 |
-
find_unused_parameters: True
|
| 112 |
-
|
| 113 |
-
# 学習率スケジューリング設定
|
| 114 |
-
# CosineAnnealingLR専用設定(パラメータを最小限に)
|
| 115 |
-
lr_scheduler:
|
| 116 |
-
type: CosineAnnealingLR
|
| 117 |
-
T_max: 400
|
| 118 |
-
eta_min: 0.0000001
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
|
| 124 |
-
|
| 125 |
-
lrsheduler: null
|
| 126 |
-
|
| 127 |
-
# deimv2.ymlのflatcosineスケジューラーも無効化
|
| 128 |
-
lr_gamma: null
|
| 129 |
-
warmup_iter: null
|
| 130 |
-
flat_epoch: null
|
| 131 |
-
no_aug_epoch: null
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
# ---- Data Aug / Loader(図面+640px+OOM対策)----
|
| 135 |
train_dataloader:
|
| 136 |
-
dataset:
|
| 137 |
transforms:
|
| 138 |
ops:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
max_cached_images: 20, random_pop: True}
|
| 143 |
-
- {type: RandomPhotometricDistort, p: 0.2}
|
| 144 |
- {type: RandomZoomOut, fill: 0}
|
| 145 |
-
- {type: RandomIoUCrop, p: 0.
|
| 146 |
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 147 |
- {type: RandomHorizontalFlip}
|
| 148 |
-
- {type:
|
| 149 |
-
- {type: Resize, size: [640, 640]} # ★ 640固定
|
| 150 |
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 151 |
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 152 |
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 153 |
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 154 |
policy:
|
| 155 |
-
epoch: [
|
| 156 |
|
| 157 |
-
collate_fn:
|
|
|
|
|
|
|
| 158 |
ema_restart_decay: 0.9999
|
| 159 |
-
base_size_repeat:
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
# 実装が読む場合のみ有効。読まない場合は base/dataloader.yml や起動引数で制御
|
| 164 |
-
total_batch_size: 4 # ★ まずは 4 に落として安定化
|
| 165 |
|
| 166 |
val_dataloader:
|
| 167 |
dataset:
|
| 168 |
transforms:
|
| 169 |
ops:
|
| 170 |
-
- {type: Resize, size: [640, 640]}
|
| 171 |
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 172 |
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 173 |
-
total_batch_size: 6 # 評価も同程度に
|
| 174 |
|
| 175 |
DEIMCriterion:
|
| 176 |
matcher:
|
|
|
|
| 177 |
change_matcher: True
|
| 178 |
iou_order_alpha: 4.0
|
| 179 |
-
matcher_change_epoch:
|
| 180 |
-
gamma: 1.5
|
| 181 |
-
alpha: 0.75
|
| 182 |
-
weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
|
| 183 |
-
losses: [mal, boxes, local]
|
| 184 |
-
|
| 185 |
-
# 出力設定 - 最後のエポック必ず保存
|
| 186 |
-
output:
|
| 187 |
-
save_last: true
|
| 188 |
-
save_interval: 5 # チェックポイント保存間隔
|
| 189 |
-
checkpoint_freq: 5 # 学習ループでの保存頻度
|
|
|
|
| 1 |
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
]
|
| 8 |
|
| 9 |
+
output_dir: ./outputs/deimv2_dinov3_s_coco
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
DEIM:
|
| 12 |
backbone: DINOv3STAs
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
DINOv3STAs:
|
| 15 |
name: vit_tiny
|
| 16 |
embed_dim: 192
|
| 17 |
+
weights_path: ./ckpts/vitt_distill.pt
|
| 18 |
+
interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
|
| 19 |
num_heads: 3
|
| 20 |
|
| 21 |
HybridEncoder:
|
|
|
|
| 56 |
betas: [0.9, 0.999]
|
| 57 |
weight_decay: 0.0001
|
| 58 |
|
| 59 |
+
# Increase to search for the optimal ema
|
| 60 |
+
epoches: 132 # 120 + 12 (stop_epoch + no_aug_epoch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
## Our LR-Scheduler
|
| 63 |
+
flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 64 |
+
no_aug_epoch: 12
|
| 65 |
|
| 66 |
+
## Our DataAug
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
train_dataloader:
|
| 68 |
+
dataset:
|
| 69 |
transforms:
|
| 70 |
ops:
|
| 71 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 72 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 73 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
|
|
|
|
|
|
| 74 |
- {type: RandomZoomOut, fill: 0}
|
| 75 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 76 |
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 77 |
- {type: RandomHorizontalFlip}
|
| 78 |
+
- {type: Resize, size: [640, 640], }
|
|
|
|
| 79 |
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 80 |
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 81 |
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 82 |
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 83 |
policy:
|
| 84 |
+
epoch: [4, 64, 120] # list
|
| 85 |
|
| 86 |
+
collate_fn:
|
| 87 |
+
base_size: 640
|
| 88 |
+
mixup_prob: 0.5
|
| 89 |
ema_restart_decay: 0.9999
|
| 90 |
+
base_size_repeat: 20
|
| 91 |
+
mixup_epochs: [4, 64]
|
| 92 |
+
stop_epoch: 120
|
| 93 |
+
copyblend_epochs: [4, 120]
|
|
|
|
|
|
|
| 94 |
|
| 95 |
val_dataloader:
|
| 96 |
dataset:
|
| 97 |
transforms:
|
| 98 |
ops:
|
| 99 |
+
- {type: Resize, size: [640, 640], }
|
| 100 |
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 101 |
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
|
|
|
| 102 |
|
| 103 |
DEIMCriterion:
|
| 104 |
matcher:
|
| 105 |
+
# change matcher
|
| 106 |
change_matcher: True
|
| 107 |
iou_order_alpha: 4.0
|
| 108 |
+
matcher_change_epoch: 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/deimv2/deimv2_dinov3_x_coco.yml
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
output_dir: ./outputs/deimv2_dinov3_x_coco
|
| 11 |
+
|
| 12 |
+
DEIM:
|
| 13 |
+
backbone: DINOv3STAs
|
| 14 |
+
|
| 15 |
+
DINOv3STAs:
|
| 16 |
+
name: dinov3_vits16plus
|
| 17 |
+
weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth
|
| 18 |
+
interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
|
| 19 |
+
finetune: True
|
| 20 |
+
conv_inplane: 64
|
| 21 |
+
hidden_dim: 256
|
| 22 |
+
|
| 23 |
+
HybridEncoder:
|
| 24 |
+
in_channels: [256, 256, 256]
|
| 25 |
+
# intra
|
| 26 |
+
hidden_dim: 256
|
| 27 |
+
dim_feedforward: 1024
|
| 28 |
+
|
| 29 |
+
# cross
|
| 30 |
+
expansion: 1.25
|
| 31 |
+
depth_mult: 1.37
|
| 32 |
+
|
| 33 |
+
DEIMTransformer:
|
| 34 |
+
num_layers: 6
|
| 35 |
+
eval_idx: -1
|
| 36 |
+
feat_channels: [256, 256, 256]
|
| 37 |
+
# reg_scale: 8
|
| 38 |
+
hidden_dim: 256
|
| 39 |
+
dim_feedforward: 2048
|
| 40 |
+
|
| 41 |
+
optimizer:
|
| 42 |
+
type: AdamW
|
| 43 |
+
params:
|
| 44 |
+
-
|
| 45 |
+
# self.dinov3 parameters, excluding norm/bn/bias
|
| 46 |
+
params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
|
| 47 |
+
lr: 0.00001
|
| 48 |
+
-
|
| 49 |
+
# only the norm/bn/bias parameters inside self.dinov3
|
| 50 |
+
params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
|
| 51 |
+
lr: 0.00001
|
| 52 |
+
weight_decay: 0.
|
| 53 |
+
-
|
| 54 |
+
# norm/bn/bias parameters outside self.dinov3 (sta, encoder, decoder)
|
| 55 |
+
params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 56 |
+
weight_decay: 0.
|
| 57 |
+
|
| 58 |
+
lr: 0.0005
|
| 59 |
+
betas: [0.9, 0.999]
|
| 60 |
+
weight_decay: 0.000125
|
| 61 |
+
|
| 62 |
+
## Dense O2O: Mosaic + Mixup + CopyBlend
|
| 63 |
+
train_dataloader:
|
| 64 |
+
dataset:
|
| 65 |
+
transforms:
|
| 66 |
+
ops:
|
| 67 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 68 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 69 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 70 |
+
- {type: RandomZoomOut, fill: 0}
|
| 71 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 72 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 73 |
+
- {type: RandomHorizontalFlip}
|
| 74 |
+
- {type: Resize, size: [640, 640], }
|
| 75 |
+
- {type: SanitizeBoundingBoxes, min_size: 1}
|
| 76 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 77 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
| 78 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 79 |
+
policy:
|
| 80 |
+
epoch: [4, 29, 50] # list
|
| 81 |
+
|
| 82 |
+
collate_fn:
|
| 83 |
+
mixup_epochs: [4, 29]
|
| 84 |
+
stop_epoch: 50
|
| 85 |
+
copyblend_epochs: [4, 50]
|
| 86 |
+
base_size_repeat: 3
|
| 87 |
+
|
| 88 |
+
val_dataloader:
|
| 89 |
+
dataset:
|
| 90 |
+
transforms:
|
| 91 |
+
ops:
|
| 92 |
+
- {type: Resize, size: [640, 640], }
|
| 93 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 94 |
+
- {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
|
configs/deimv2/deimv2_hgnetv2_atto_coco.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_atto_coco
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
encoder: LiteEncoder
|
| 13 |
+
|
| 14 |
+
HGNetv2:
|
| 15 |
+
name: 'Atto'
|
| 16 |
+
return_idx: [2]
|
| 17 |
+
freeze_at: -1
|
| 18 |
+
freeze_norm: False
|
| 19 |
+
use_lab: True
|
| 20 |
+
|
| 21 |
+
LiteEncoder:
|
| 22 |
+
in_channels: [256]
|
| 23 |
+
feat_strides: [16]
|
| 24 |
+
# intra
|
| 25 |
+
hidden_dim: 64
|
| 26 |
+
|
| 27 |
+
# cross
|
| 28 |
+
expansion: 0.34
|
| 29 |
+
depth_mult: 0.5
|
| 30 |
+
act: 'silu'
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
DEIMTransformer:
|
| 34 |
+
feat_channels: [64, 64]
|
| 35 |
+
feat_strides: [16, 32]
|
| 36 |
+
hidden_dim: 64
|
| 37 |
+
num_levels: 2
|
| 38 |
+
num_points: [4, 2]
|
| 39 |
+
|
| 40 |
+
num_layers: 3
|
| 41 |
+
eval_idx: -1
|
| 42 |
+
num_queries: 100
|
| 43 |
+
|
| 44 |
+
# FFN
|
| 45 |
+
dim_feedforward: 160
|
| 46 |
+
|
| 47 |
+
# New options for DEIMv2
|
| 48 |
+
share_bbox_head: True
|
| 49 |
+
use_gateway: False
|
| 50 |
+
|
| 51 |
+
# Increase to search for the optimal ema
|
| 52 |
+
epoches: 500 # 468 + 32
|
| 53 |
+
|
| 54 |
+
## Our LR-Scheduler
|
| 55 |
+
warmup_iter: 4000
|
| 56 |
+
flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 57 |
+
no_aug_epoch: 32
|
| 58 |
+
lr_gamma: 0.5
|
| 59 |
+
|
| 60 |
+
optimizer:
|
| 61 |
+
type: AdamW
|
| 62 |
+
params:
|
| 63 |
+
- params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 64 |
+
lr: 0.001
|
| 65 |
+
- params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 66 |
+
lr: 0.001
|
| 67 |
+
weight_decay: 0.
|
| 68 |
+
- params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias
|
| 69 |
+
weight_decay: 0.
|
| 70 |
+
|
| 71 |
+
lr: 0.002
|
| 72 |
+
betas: [0.9, 0.999]
|
| 73 |
+
weight_decay: 0.0001
|
| 74 |
+
|
| 75 |
+
eval_spatial_size: [320, 320]
|
| 76 |
+
train_dataloader:
|
| 77 |
+
total_batch_size: 128
|
| 78 |
+
dataset:
|
| 79 |
+
transforms:
|
| 80 |
+
ops:
|
| 81 |
+
- {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 82 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 83 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 84 |
+
- {type: RandomZoomOut, fill: 0}
|
| 85 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 86 |
+
- {type: SanitizeBoundingBoxes, min_size: 12}
|
| 87 |
+
- {type: RandomHorizontalFlip}
|
| 88 |
+
- {type: Resize, size: [320, 320], }
|
| 89 |
+
- {type: SanitizeBoundingBoxes, min_size: 12}
|
| 90 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 91 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 92 |
+
policy:
|
| 93 |
+
epoch: [4, 250, 400] # list
|
| 94 |
+
mosaic_prob: 0.3
|
| 95 |
+
|
| 96 |
+
collate_fn:
|
| 97 |
+
mixup_prob: 0.0
|
| 98 |
+
mixup_epochs: [40000, 15000]
|
| 99 |
+
copyblend_prob: 0.0
|
| 100 |
+
copyblend_epochs: [40000, 15000]
|
| 101 |
+
|
| 102 |
+
stop_epoch: 468 # 468 + 32
|
| 103 |
+
ema_restart_decay: 0.9999
|
| 104 |
+
base_size: 320
|
| 105 |
+
base_size_repeat: ~
|
| 106 |
+
|
| 107 |
+
val_dataloader:
|
| 108 |
+
total_batch_size: 256
|
| 109 |
+
dataset:
|
| 110 |
+
transforms:
|
| 111 |
+
ops:
|
| 112 |
+
- {type: Resize, size: [320, 320], }
|
| 113 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 114 |
+
shuffle: False
|
| 115 |
+
num_workers: 16
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
DEIMCriterion:
|
| 119 |
+
losses: ['mal', 'boxes'] # , 'local'
|
| 120 |
+
use_uni_set: False
|
| 121 |
+
|
| 122 |
+
matcher:
|
| 123 |
+
matcher_change_epoch: 450 # FIX This
|
configs/deimv2/deimv2_hgnetv2_femto_coco.yml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_femto_coco
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
encoder: LiteEncoder
|
| 13 |
+
|
| 14 |
+
HGNetv2:
|
| 15 |
+
name: 'Femto'
|
| 16 |
+
return_idx: [2]
|
| 17 |
+
freeze_at: -1
|
| 18 |
+
freeze_norm: False
|
| 19 |
+
use_lab: True
|
| 20 |
+
|
| 21 |
+
LiteEncoder:
|
| 22 |
+
in_channels: [512]
|
| 23 |
+
feat_strides: [16]
|
| 24 |
+
|
| 25 |
+
# intra
|
| 26 |
+
hidden_dim: 96
|
| 27 |
+
|
| 28 |
+
# cross
|
| 29 |
+
expansion: 0.34
|
| 30 |
+
depth_mult: 0.5
|
| 31 |
+
act: 'silu'
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
DEIMTransformer:
|
| 35 |
+
feat_channels: [96, 96]
|
| 36 |
+
feat_strides: [16, 32]
|
| 37 |
+
hidden_dim: 96
|
| 38 |
+
num_levels: 2
|
| 39 |
+
num_points: [4, 2]
|
| 40 |
+
|
| 41 |
+
num_layers: 3
|
| 42 |
+
eval_idx: -1
|
| 43 |
+
num_queries: 150
|
| 44 |
+
|
| 45 |
+
# FFN
|
| 46 |
+
dim_feedforward: 256
|
| 47 |
+
|
| 48 |
+
# New options for DEIMv2
|
| 49 |
+
share_bbox_head: True
|
| 50 |
+
use_gateway: False
|
| 51 |
+
|
| 52 |
+
# Increase to search for the optimal ema
|
| 53 |
+
epoches: 500 # 468 + 32
|
| 54 |
+
|
| 55 |
+
## Our LR-Scheduler
|
| 56 |
+
warmup_iter: 4000
|
| 57 |
+
flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 58 |
+
no_aug_epoch: 32
|
| 59 |
+
lr_gamma: 0.5
|
| 60 |
+
|
| 61 |
+
optimizer:
|
| 62 |
+
type: AdamW
|
| 63 |
+
params:
|
| 64 |
+
-
|
| 65 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 66 |
+
lr: 0.0008
|
| 67 |
+
-
|
| 68 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 69 |
+
lr: 0.0008
|
| 70 |
+
weight_decay: 0.
|
| 71 |
+
- # not opt
|
| 72 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 73 |
+
weight_decay: 0.
|
| 74 |
+
|
| 75 |
+
lr: 0.0016
|
| 76 |
+
betas: [0.9, 0.999]
|
| 77 |
+
weight_decay: 0.0001
|
| 78 |
+
|
| 79 |
+
eval_spatial_size: [416, 416]
|
| 80 |
+
train_dataloader:
|
| 81 |
+
total_batch_size: 128
|
| 82 |
+
dataset:
|
| 83 |
+
transforms:
|
| 84 |
+
ops:
|
| 85 |
+
- {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 86 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 87 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 88 |
+
- {type: RandomZoomOut, fill: 0}
|
| 89 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 90 |
+
- {type: SanitizeBoundingBoxes, min_size: 10}
|
| 91 |
+
- {type: RandomHorizontalFlip}
|
| 92 |
+
- {type: Resize, size: [416, 416], }
|
| 93 |
+
- {type: SanitizeBoundingBoxes, min_size: 10}
|
| 94 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 95 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 96 |
+
policy:
|
| 97 |
+
epoch: [4, 250, 400] # list
|
| 98 |
+
ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
|
| 99 |
+
mosaic_prob: 0.5
|
| 100 |
+
|
| 101 |
+
collate_fn:
|
| 102 |
+
mixup_prob: 0.0
|
| 103 |
+
mixup_epochs: [40000, 15000]
|
| 104 |
+
copyblend_prob: 0.0
|
| 105 |
+
copyblend_epochs: [40000, 15000]
|
| 106 |
+
|
| 107 |
+
stop_epoch: 468 # 468 + 32
|
| 108 |
+
ema_restart_decay: 0.9999
|
| 109 |
+
base_size: 416
|
| 110 |
+
base_size_repeat: ~
|
| 111 |
+
|
| 112 |
+
val_dataloader:
|
| 113 |
+
total_batch_size: 256
|
| 114 |
+
dataset:
|
| 115 |
+
transforms:
|
| 116 |
+
ops:
|
| 117 |
+
- {type: Resize, size: [416, 416], }
|
| 118 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 119 |
+
shuffle: False
|
| 120 |
+
num_workers: 16
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
DEIMCriterion:
|
| 124 |
+
losses: ['mal', 'boxes'] # , 'local'
|
| 125 |
+
use_uni_set: False
|
| 126 |
+
|
| 127 |
+
matcher:
|
| 128 |
+
matcher_change_epoch: 450 # FIX This
|
configs/deimv2/deimv2_hgnetv2_l_coco.yml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_l_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
optimizer:
|
| 13 |
+
type: AdamW
|
| 14 |
+
params:
|
| 15 |
+
-
|
| 16 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 17 |
+
lr: 0.000025
|
| 18 |
+
-
|
| 19 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 20 |
+
weight_decay: 0.
|
| 21 |
+
|
| 22 |
+
lr: 0.0005
|
| 23 |
+
betas: [0.9, 0.999]
|
| 24 |
+
weight_decay: 0.000125
|
configs/deimv2/deimv2_hgnetv2_m_coco.yml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_m_coco
|
| 10 |
+
|
| 11 |
+
HGNetv2:
|
| 12 |
+
name: 'B2'
|
| 13 |
+
return_idx: [1, 2, 3]
|
| 14 |
+
freeze_at: -1
|
| 15 |
+
freeze_norm: False
|
| 16 |
+
use_lab: True
|
| 17 |
+
|
| 18 |
+
HybridEncoder:
|
| 19 |
+
in_channels: [384, 768, 1536]
|
| 20 |
+
hidden_dim: 256
|
| 21 |
+
depth_mult: 0.67
|
| 22 |
+
|
| 23 |
+
DEIMTransformer:
|
| 24 |
+
num_layers: 4 # 5 6
|
| 25 |
+
eval_idx: -1 # -2 -3
|
| 26 |
+
|
| 27 |
+
optimizer:
|
| 28 |
+
type: AdamW
|
| 29 |
+
params:
|
| 30 |
+
-
|
| 31 |
+
params: '^(?=.*backbone)(?!.*bn).*$'
|
| 32 |
+
lr: 0.00004
|
| 33 |
+
-
|
| 34 |
+
params: '^(?=.*(?:norm|bn)).*$'
|
| 35 |
+
weight_decay: 0.
|
| 36 |
+
|
| 37 |
+
lr: 0.0004
|
| 38 |
+
betas: [0.9, 0.999]
|
| 39 |
+
weight_decay: 0.0001
|
| 40 |
+
|
| 41 |
+
# Increase to search for the optimal ema
|
| 42 |
+
epoches: 102 # 90 + 12 (stop_epoch + no_aug_epoch)
|
| 43 |
+
|
| 44 |
+
## Our LR-Scheduler
|
| 45 |
+
flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 46 |
+
no_aug_epoch: 12
|
| 47 |
+
|
| 48 |
+
## Our DataAug
|
| 49 |
+
train_dataloader:
|
| 50 |
+
dataset:
|
| 51 |
+
transforms:
|
| 52 |
+
policy:
|
| 53 |
+
epoch: [4, 49, 90] # list
|
| 54 |
+
|
| 55 |
+
collate_fn:
|
| 56 |
+
ema_restart_decay: 0.9999
|
| 57 |
+
base_size_repeat: 6
|
| 58 |
+
mixup_epochs: [4, 49]
|
| 59 |
+
stop_epoch: 90
|
| 60 |
+
copyblend_prob: 0.5
|
| 61 |
+
copyblend_epochs: [4, 90]
|
| 62 |
+
area_threshold: 100
|
| 63 |
+
num_objects: 3
|
| 64 |
+
with_expand: True
|
| 65 |
+
expand_ratios: [0.1, 0.25]
|
| 66 |
+
|
| 67 |
+
DEIMCriterion:
|
| 68 |
+
matcher:
|
| 69 |
+
# new matcher
|
| 70 |
+
change_matcher: True
|
| 71 |
+
iou_order_alpha: 4.0
|
| 72 |
+
matcher_change_epoch: 80
|
configs/deimv2/deimv2_hgnetv2_n_coco.yml
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_n_coco
|
| 10 |
+
|
| 11 |
+
HGNetv2:
|
| 12 |
+
name: 'B0'
|
| 13 |
+
return_idx: [2, 3]
|
| 14 |
+
freeze_at: -1
|
| 15 |
+
freeze_norm: False
|
| 16 |
+
use_lab: True
|
| 17 |
+
|
| 18 |
+
HybridEncoder:
|
| 19 |
+
in_channels: [512, 1024]
|
| 20 |
+
feat_strides: [16, 32]
|
| 21 |
+
|
| 22 |
+
# intra
|
| 23 |
+
hidden_dim: 128
|
| 24 |
+
use_encoder_idx: [1]
|
| 25 |
+
dim_feedforward: 512
|
| 26 |
+
|
| 27 |
+
# cross
|
| 28 |
+
expansion: 0.34
|
| 29 |
+
depth_mult: 0.5
|
| 30 |
+
|
| 31 |
+
version: 'dfine'
|
| 32 |
+
|
| 33 |
+
DEIMTransformer:
|
| 34 |
+
feat_channels: [128, 128]
|
| 35 |
+
feat_strides: [16, 32]
|
| 36 |
+
hidden_dim: 128
|
| 37 |
+
num_levels: 2
|
| 38 |
+
num_points: [6, 6]
|
| 39 |
+
|
| 40 |
+
num_layers: 3
|
| 41 |
+
eval_idx: -1
|
| 42 |
+
|
| 43 |
+
# FFN
|
| 44 |
+
dim_feedforward: 512
|
| 45 |
+
|
| 46 |
+
optimizer:
|
| 47 |
+
type: AdamW
|
| 48 |
+
params:
|
| 49 |
+
-
|
| 50 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 51 |
+
lr: 0.0004
|
| 52 |
+
-
|
| 53 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 54 |
+
lr: 0.0004
|
| 55 |
+
weight_decay: 0.
|
| 56 |
+
-
|
| 57 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 58 |
+
weight_decay: 0.
|
| 59 |
+
|
| 60 |
+
lr: 0.0008
|
| 61 |
+
betas: [0.9, 0.999]
|
| 62 |
+
weight_decay: 0.0001
|
| 63 |
+
|
| 64 |
+
# Increase to search for the optimal ema
|
| 65 |
+
epoches: 160 # 148 + 12
|
| 66 |
+
|
| 67 |
+
## Our LR-Scheduler
|
| 68 |
+
flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 69 |
+
no_aug_epoch: 12
|
| 70 |
+
lr_gamma: 1.0
|
| 71 |
+
|
| 72 |
+
## Our DataAug
|
| 73 |
+
train_dataloader:
|
| 74 |
+
dataset:
|
| 75 |
+
transforms:
|
| 76 |
+
policy:
|
| 77 |
+
epoch: [4, 78, 148] # list
|
| 78 |
+
|
| 79 |
+
collate_fn:
|
| 80 |
+
ema_restart_decay: 0.9999
|
| 81 |
+
base_size_repeat: ~
|
| 82 |
+
mixup_epochs: [4, 78]
|
| 83 |
+
stop_epoch: 148
|
| 84 |
+
copyblend_prob: 0.4
|
| 85 |
+
copyblend_epochs: [4, 78] # CP half
|
| 86 |
+
area_threshold: 100
|
| 87 |
+
num_objects: 3
|
| 88 |
+
with_expand: True
|
| 89 |
+
expand_ratios: [0.1, 0.25]
|
| 90 |
+
|
| 91 |
+
DEIMCriterion:
|
| 92 |
+
matcher:
|
| 93 |
+
# new matcher
|
| 94 |
+
change_matcher: True
|
| 95 |
+
iou_order_alpha: 4.0
|
| 96 |
+
matcher_change_epoch: 136
|
configs/deimv2/deimv2_hgnetv2_pico_coco.yml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml',
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_pico_coco
|
| 10 |
+
|
| 11 |
+
DEIM:
|
| 12 |
+
encoder: LiteEncoder
|
| 13 |
+
decoder: DEIMTransformer
|
| 14 |
+
|
| 15 |
+
HGNetv2:
|
| 16 |
+
name: 'Pico'
|
| 17 |
+
return_idx: [2]
|
| 18 |
+
freeze_at: -1
|
| 19 |
+
freeze_norm: False
|
| 20 |
+
use_lab: True
|
| 21 |
+
|
| 22 |
+
LiteEncoder:
|
| 23 |
+
in_channels: [512]
|
| 24 |
+
feat_strides: [16]
|
| 25 |
+
|
| 26 |
+
# intra
|
| 27 |
+
hidden_dim: 112
|
| 28 |
+
|
| 29 |
+
# cross
|
| 30 |
+
expansion: 0.34
|
| 31 |
+
depth_mult: 0.5
|
| 32 |
+
act: 'silu'
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
DEIMTransformer:
|
| 36 |
+
feat_channels: [112, 112]
|
| 37 |
+
feat_strides: [16, 32]
|
| 38 |
+
hidden_dim: 112
|
| 39 |
+
num_levels: 2
|
| 40 |
+
num_points: [4, 2]
|
| 41 |
+
|
| 42 |
+
num_layers: 3
|
| 43 |
+
eval_idx: -1
|
| 44 |
+
num_queries: 200
|
| 45 |
+
|
| 46 |
+
# FFN
|
| 47 |
+
dim_feedforward: 320
|
| 48 |
+
|
| 49 |
+
# New options for DEIMv2
|
| 50 |
+
share_bbox_head: True
|
| 51 |
+
use_gateway: False
|
| 52 |
+
|
| 53 |
+
# Increase to search for the optimal ema
|
| 54 |
+
epoches: 500 # 468 + 32
|
| 55 |
+
|
| 56 |
+
## Our LR-Scheduler
|
| 57 |
+
warmup_iter: 4000
|
| 58 |
+
flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 59 |
+
no_aug_epoch: 32
|
| 60 |
+
lr_gamma: 0.5
|
| 61 |
+
|
| 62 |
+
optimizer:
|
| 63 |
+
type: AdamW
|
| 64 |
+
params:
|
| 65 |
+
-
|
| 66 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 67 |
+
lr: 0.0008
|
| 68 |
+
-
|
| 69 |
+
params: '^(?=.*backbone)(?=.*norm|bn).*$'
|
| 70 |
+
lr: 0.0008
|
| 71 |
+
weight_decay: 0.
|
| 72 |
+
- # not opt
|
| 73 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
|
| 74 |
+
weight_decay: 0.
|
| 75 |
+
|
| 76 |
+
lr: 0.0016
|
| 77 |
+
betas: [0.9, 0.999]
|
| 78 |
+
weight_decay: 0.0001
|
| 79 |
+
|
| 80 |
+
eval_spatial_size: [640, 640]
|
| 81 |
+
train_dataloader:
|
| 82 |
+
total_batch_size: 128
|
| 83 |
+
dataset:
|
| 84 |
+
transforms:
|
| 85 |
+
ops:
|
| 86 |
+
- {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
|
| 87 |
+
probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
|
| 88 |
+
- {type: RandomPhotometricDistort, p: 0.5}
|
| 89 |
+
- {type: RandomZoomOut, fill: 0}
|
| 90 |
+
- {type: RandomIoUCrop, p: 0.8}
|
| 91 |
+
- {type: SanitizeBoundingBoxes, min_size: 8}
|
| 92 |
+
- {type: RandomHorizontalFlip}
|
| 93 |
+
- {type: Resize, size: [640, 640], }
|
| 94 |
+
- {type: SanitizeBoundingBoxes, min_size: 8}
|
| 95 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 96 |
+
- {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
|
| 97 |
+
policy:
|
| 98 |
+
epoch: [4, 250, 400] # list
|
| 99 |
+
ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
|
| 100 |
+
mosaic_prob: 0.5
|
| 101 |
+
|
| 102 |
+
collate_fn:
|
| 103 |
+
mixup_prob: 0.0
|
| 104 |
+
mixup_epochs: [40000, 15000]
|
| 105 |
+
copyblend_prob: 0.0
|
| 106 |
+
copyblend_epochs: [40000, 15000]
|
| 107 |
+
stop_epoch: 468 # 500 - 32 (epoches - no_aug_epoch)
|
| 108 |
+
ema_restart_decay: 0.9999
|
| 109 |
+
base_size: 640
|
| 110 |
+
base_size_repeat: ~
|
| 111 |
+
|
| 112 |
+
val_dataloader:
|
| 113 |
+
total_batch_size: 256
|
| 114 |
+
dataset:
|
| 115 |
+
transforms:
|
| 116 |
+
ops:
|
| 117 |
+
- {type: Resize, size: [640, 640], }
|
| 118 |
+
- {type: ConvertPILImage, dtype: 'float32', scale: True}
|
| 119 |
+
shuffle: False
|
| 120 |
+
num_workers: 16
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
DEIMCriterion:
|
| 124 |
+
losses: ['mal', 'boxes'] # , 'local'
|
| 125 |
+
use_uni_set: False
|
| 126 |
+
|
| 127 |
+
matcher:
|
| 128 |
+
matcher_change_epoch: 450 # FIX This
|
configs/deimv2/deimv2_hgnetv2_s_coco.yml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_s_coco
|
| 10 |
+
|
| 11 |
+
HGNetv2:
|
| 12 |
+
name: 'B0'
|
| 13 |
+
return_idx: [1, 2, 3]
|
| 14 |
+
freeze_at: -1
|
| 15 |
+
freeze_norm: False
|
| 16 |
+
use_lab: True
|
| 17 |
+
|
| 18 |
+
HybridEncoder:
|
| 19 |
+
in_channels: [256, 512, 1024]
|
| 20 |
+
hidden_dim: 256
|
| 21 |
+
depth_mult: 0.34
|
| 22 |
+
expansion: 0.5
|
| 23 |
+
|
| 24 |
+
version: 'dfine'
|
| 25 |
+
|
| 26 |
+
DEIMTransformer:
|
| 27 |
+
num_layers: 3 # 4 5 6
|
| 28 |
+
eval_idx: -1 # -2 -3 -4
|
| 29 |
+
|
| 30 |
+
optimizer:
|
| 31 |
+
type: AdamW
|
| 32 |
+
params:
|
| 33 |
+
-
|
| 34 |
+
params: '^(?=.*backbone)(?!.*bn).*$'
|
| 35 |
+
lr: 0.0002
|
| 36 |
+
-
|
| 37 |
+
params: '^(?=.*(?:norm|bn)).*$' # except bias
|
| 38 |
+
weight_decay: 0.
|
| 39 |
+
|
| 40 |
+
lr: 0.0004
|
| 41 |
+
betas: [0.9, 0.999]
|
| 42 |
+
weight_decay: 0.0001
|
| 43 |
+
|
| 44 |
+
# Increase to search for the optimal ema
|
| 45 |
+
epoches: 132 # 120 + 4n
|
| 46 |
+
|
| 47 |
+
## Our LR-Scheduler
|
| 48 |
+
flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 49 |
+
no_aug_epoch: 12
|
| 50 |
+
|
| 51 |
+
## Our DataAug
|
| 52 |
+
train_dataloader:
|
| 53 |
+
dataset:
|
| 54 |
+
transforms:
|
| 55 |
+
policy:
|
| 56 |
+
epoch: [4, 64, 120] # list
|
| 57 |
+
|
| 58 |
+
collate_fn:
|
| 59 |
+
ema_restart_decay: 0.9999
|
| 60 |
+
base_size_repeat: 20
|
| 61 |
+
mixup_epochs: [4, 64]
|
| 62 |
+
stop_epoch: 120
|
| 63 |
+
copyblend_prob: 0.5
|
| 64 |
+
# copyblend_epochs: [4, 64] # from v11 to v12: copy-paste runs for only half of the epochs
|
| 65 |
+
copyblend_epochs: [4, 120]
|
| 66 |
+
area_threshold: 100
|
| 67 |
+
num_objects: 3
|
| 68 |
+
with_expand: True
|
| 69 |
+
expand_ratios: [0.1, 0.25]
|
| 70 |
+
|
| 71 |
+
DEIMCriterion:
|
| 72 |
+
matcher:
|
| 73 |
+
# new matcher
|
| 74 |
+
change_matcher: True
|
| 75 |
+
iou_order_alpha: 4.0
|
| 76 |
+
matcher_change_epoch: 100
|
configs/deimv2/deimv2_hgnetv2_x_coco.yml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__include__: [
|
| 2 |
+
'../dataset/coco_detection.yml',
|
| 3 |
+
'../runtime.yml',
|
| 4 |
+
'../base/dataloader.yml',
|
| 5 |
+
'../base/optimizer.yml',
|
| 6 |
+
'../base/deimv2.yml'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
output_dir: ./outputs/deimv2_hgnetv2_x_coco
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
HGNetv2:
|
| 13 |
+
name: 'B5'
|
| 14 |
+
return_idx: [1, 2, 3]
|
| 15 |
+
freeze_stem_only: True
|
| 16 |
+
freeze_at: 0
|
| 17 |
+
freeze_norm: True
|
| 18 |
+
|
| 19 |
+
HybridEncoder:
|
| 20 |
+
# intra
|
| 21 |
+
hidden_dim: 384
|
| 22 |
+
dim_feedforward: 2048
|
| 23 |
+
|
| 24 |
+
DEIMTransformer:
|
| 25 |
+
feat_channels: [384, 384, 384] # [256, 256, 256]
|
| 26 |
+
reg_scale: 8 # 4
|
| 27 |
+
|
| 28 |
+
# FFN
|
| 29 |
+
dim_feedforward: 2048
|
| 30 |
+
|
| 31 |
+
optimizer:
|
| 32 |
+
type: AdamW
|
| 33 |
+
params:
|
| 34 |
+
-
|
| 35 |
+
params: '^(?=.*backbone)(?!.*norm|bn).*$'
|
| 36 |
+
lr: 0.000005
|
| 37 |
+
-
|
| 38 |
+
params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
|
| 39 |
+
weight_decay: 0.
|
| 40 |
+
|
| 41 |
+
lr: 0.0005
|
| 42 |
+
betas: [0.9, 0.999]
|
| 43 |
+
weight_decay: 0.000125
|
| 44 |
+
|
| 45 |
+
# Increase to search for the optimal ema
|
| 46 |
+
epoches: 58 # 50 + 8 (last policy epoch + no_aug_epoch)
|
| 47 |
+
|
| 48 |
+
## Our LR-Scheduler
|
| 49 |
+
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
|
| 50 |
+
no_aug_epoch: 8
|
| 51 |
+
|
| 52 |
+
train_dataloader:
|
| 53 |
+
dataset:
|
| 54 |
+
transforms:
|
| 55 |
+
policy:
|
| 56 |
+
epoch: [4, 29, 50] # list
|
| 57 |
+
|
| 58 |
+
collate_fn:
|
| 59 |
+
ema_restart_decay: 0.9998
|
| 60 |
+
base_size_repeat: 3
|
configs/runtime.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
print_freq: 500
|
| 2 |
+
output_dir: './logs'
|
| 3 |
+
checkpoint_freq: 12
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
sync_bn: True
|
| 7 |
+
find_unused_parameters: True
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
use_amp: False
|
| 11 |
+
scaler:
|
| 12 |
+
type: GradScaler
|
| 13 |
+
enabled: True
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
use_ema: False
|
| 17 |
+
ema:
|
| 18 |
+
type: ModelEMA
|
| 19 |
+
decay: 0.9999
|
| 20 |
+
warmups: 1000
|
engine/__init__.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
from . import deim
|
| 5 |
-
from . import data
|
| 6 |
-
from . import optim
|
| 7 |
-
from . import misc
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
from .
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
|
| 3 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
# for registration purposes
|
| 6 |
+
from . import optim
|
| 7 |
+
from . import data
|
| 8 |
+
from . import deim
|
| 9 |
|
| 10 |
+
from .backbone import *
|
| 11 |
|
| 12 |
+
from .backbone import (
|
| 13 |
+
get_activation,
|
| 14 |
+
FrozenBatchNorm2d,
|
| 15 |
+
freeze_batch_norm2d,
|
| 16 |
+
)
|
engine/backbone/vit_tiny.py
CHANGED
|
@@ -6,14 +6,16 @@ Modified from DINOv3 (https://github.com/facebookresearch/dinov3)
|
|
| 6 |
Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py
|
| 7 |
|
| 8 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import torch
|
| 10 |
import torch.nn as nn
|
| 11 |
import torch.nn.functional as F
|
| 12 |
-
from
|
| 13 |
-
import math
|
| 14 |
-
import numpy as np
|
| 15 |
-
import warnings
|
| 16 |
-
from typing import Literal, Tuple
|
| 17 |
|
| 18 |
|
| 19 |
class RopePositionEmbedding(nn.Module):
|
|
@@ -180,11 +182,11 @@ class Attention(nn.Module):
|
|
| 180 |
head_dim = dim // num_heads
|
| 181 |
self.scale = head_dim ** -0.5
|
| 182 |
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 183 |
-
self.attn_drop =
|
| 184 |
self.proj = nn.Linear(dim, dim)
|
| 185 |
self.proj_drop = nn.Dropout(proj_drop)
|
| 186 |
|
| 187 |
-
def forward(self, x, rope_sincos=None
|
| 188 |
B, N, C = x.shape
|
| 189 |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 190 |
q, k, v = qkv.unbind(0)
|
|
@@ -200,13 +202,8 @@ class Attention(nn.Module):
|
|
| 200 |
q = torch.cat((q_cls, q_patch), dim=2)
|
| 201 |
k = torch.cat((k_cls, k_patch), dim=2)
|
| 202 |
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
attn = self.attn_drop(attn)
|
| 206 |
-
|
| 207 |
-
if register_hook: attn.register_hook(self.save_attn_gradients)
|
| 208 |
-
|
| 209 |
-
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 210 |
x = self.proj(x)
|
| 211 |
x = self.proj_drop(x)
|
| 212 |
return x
|
|
@@ -220,8 +217,8 @@ class Block(nn.Module):
|
|
| 220 |
self.norm2 = norm_layer(dim)
|
| 221 |
self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
|
| 222 |
|
| 223 |
-
def forward(self, x, rope_sincos=None
|
| 224 |
-
attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos
|
| 225 |
x = x + self.drop_path(attn_output)
|
| 226 |
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 227 |
return x
|
|
@@ -260,7 +257,6 @@ class VisionTransformer(nn.Module):
|
|
| 260 |
normalize_coords="separate", shift_coords=None, jitter_coords=None,
|
| 261 |
rescale_coords=None, dtype=None, device=None,
|
| 262 |
)
|
| 263 |
-
|
| 264 |
self.init_weights()
|
| 265 |
|
| 266 |
def init_weights(self):
|
|
@@ -286,28 +282,7 @@ class VisionTransformer(nn.Module):
|
|
| 286 |
def feature_dim(self):
|
| 287 |
return self.embed_dim
|
| 288 |
|
| 289 |
-
def
|
| 290 |
-
B, C, H, W = x.shape
|
| 291 |
-
|
| 292 |
-
x_embed = self._model.patch_embed(x)
|
| 293 |
-
cls_token = self._model.cls_token.expand(x_embed.shape[0], -1, -1)
|
| 294 |
-
x = torch.cat((cls_token, x_embed), dim=1)
|
| 295 |
-
|
| 296 |
-
patch_grid_h = H // self.patch_size
|
| 297 |
-
patch_grid_w = W // self.patch_size
|
| 298 |
-
rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
|
| 299 |
-
|
| 300 |
-
for blk in self._model.blocks:
|
| 301 |
-
x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook)
|
| 302 |
-
x = x[:, 1:, :]
|
| 303 |
-
return {'features': x.transpose(1, 2).reshape(-1, self.embed_dim, patch_grid_h, patch_grid_w)}
|
| 304 |
-
|
| 305 |
-
def forward_pool(self, x):
|
| 306 |
-
features = self.forward_features(x)['features']
|
| 307 |
-
pooled_features = features.mean(dim=[2, 3])
|
| 308 |
-
return {'pooled_features': pooled_features}
|
| 309 |
-
|
| 310 |
-
def forward(self, x, register_hook=False):
|
| 311 |
outs = []
|
| 312 |
B, C, H, W = x.shape
|
| 313 |
|
|
@@ -320,7 +295,7 @@ class VisionTransformer(nn.Module):
|
|
| 320 |
rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
|
| 321 |
|
| 322 |
for i, blk in enumerate(self._model.blocks):
|
| 323 |
-
x = blk(x, rope_sincos=rope_sincos
|
| 324 |
if i in self.return_layers:
|
| 325 |
outs.append((x[:, 1:], x[:, 0]))
|
| 326 |
return outs
|
|
|
|
| 6 |
Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py
|
| 7 |
|
| 8 |
"""
|
| 9 |
+
import math
|
| 10 |
+
import warnings
|
| 11 |
+
from functools import partial
|
| 12 |
+
from typing import List, Literal, Tuple
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
import torch
|
| 16 |
import torch.nn as nn
|
| 17 |
import torch.nn.functional as F
|
| 18 |
+
from torch import nn
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class RopePositionEmbedding(nn.Module):
|
|
|
|
| 182 |
head_dim = dim // num_heads
|
| 183 |
self.scale = head_dim ** -0.5
|
| 184 |
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 185 |
+
self.attn_drop = attn_drop
|
| 186 |
self.proj = nn.Linear(dim, dim)
|
| 187 |
self.proj_drop = nn.Dropout(proj_drop)
|
| 188 |
|
| 189 |
+
def forward(self, x, rope_sincos=None):
|
| 190 |
B, N, C = x.shape
|
| 191 |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 192 |
q, k, v = qkv.unbind(0)
|
|
|
|
| 202 |
q = torch.cat((q_cls, q_patch), dim=2)
|
| 203 |
k = torch.cat((k_cls, k_patch), dim=2)
|
| 204 |
|
| 205 |
+
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop)
|
| 206 |
+
x = x.transpose(1, 2).reshape([B, N, C])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
x = self.proj(x)
|
| 208 |
x = self.proj_drop(x)
|
| 209 |
return x
|
|
|
|
| 217 |
self.norm2 = norm_layer(dim)
|
| 218 |
self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
|
| 219 |
|
| 220 |
+
def forward(self, x, rope_sincos=None):
|
| 221 |
+
attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos)
|
| 222 |
x = x + self.drop_path(attn_output)
|
| 223 |
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 224 |
return x
|
|
|
|
| 257 |
normalize_coords="separate", shift_coords=None, jitter_coords=None,
|
| 258 |
rescale_coords=None, dtype=None, device=None,
|
| 259 |
)
|
|
|
|
| 260 |
self.init_weights()
|
| 261 |
|
| 262 |
def init_weights(self):
|
|
|
|
| 282 |
def feature_dim(self):
|
| 283 |
return self.embed_dim
|
| 284 |
|
| 285 |
+
def forward(self, x):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
outs = []
|
| 287 |
B, C, H, W = x.shape
|
| 288 |
|
|
|
|
| 295 |
rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w)
|
| 296 |
|
| 297 |
for i, blk in enumerate(self._model.blocks):
|
| 298 |
+
x = blk(x, rope_sincos=rope_sincos)
|
| 299 |
if i in self.return_layers:
|
| 300 |
outs.append((x[:, 1:], x[:, 0]))
|
| 301 |
return outs
|