diff --git a/configs/base/dataloader.yml b/configs/base/dataloader.yml
new file mode 100644
index 0000000000000000000000000000000000000000..22de3aa4645e9620c0297396da56c06c8b47e8a4
--- /dev/null
+++ b/configs/base/dataloader.yml
@@ -0,0 +1,39 @@
+
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        name: stop_epoch
+        epoch: 72 # epochs in [72, ~) stop `ops`
+        ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] # exclude Mosaic
+
+  collate_fn:
+    type: BatchImageCollateFunction
+    base_size: 640
+    base_size_repeat: 3
+    stop_epoch: 72 # epochs in [72, ~) stop `multiscales`
+
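+  # Assumed reading of the two stop knobs above (illustrative comment, not parsed config):
+  #   epoch in [0, 72)  -> the stochastic `ops` are active and the collate function samples
+  #                        multi-scale sizes derived from base_size (base_size_repeat is
+  #                        assumed to control how often the base 640 size recurs)
+  #   epoch in [72, ~)  -> the three ops named under `policy.ops` are disabled and
+  #                        multi-scale collation stops; only fixed 640x640 batches remain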
+  shuffle: True
+  total_batch_size: 32 # total batch size of 32 (e.g., 4 GPUs x 8 samples each)
+  num_workers: 4
+
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+  shuffle: False
+  total_batch_size: 64
+  num_workers: 4
diff --git a/configs/base/deim.yml b/configs/base/deim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3aa63588df77c2063c00632a90dfd45dbdee0ef5
--- /dev/null
+++ b/configs/base/deim.yml
@@ -0,0 +1,48 @@
+# Dense O2O
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+
+# Unfreezing BN
+HGNetv2:
+  freeze_at: -1 # 0 default
+  freeze_norm: False # True default
+
+# Activation
+DFINETransformer:
+  activation: silu
+  mlp_act: silu
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Our Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['mal', 'boxes', 'local']
+  gamma: 1.5
\ No newline at end of file
diff --git a/configs/base/deimv2.yml b/configs/base/deimv2.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7428898f8f734a7cb48cadcd4c7a9bd6f719d9a2
--- /dev/null
+++ b/configs/base/deimv2.yml
@@ -0,0 +1,144 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 5 # save freq
+
+DEIM:
+  backbone: HGNetv2
+  encoder: HybridEncoder
+  decoder: DEIMTransformer
+
+HGNetv2:
+  name: 'B4'
+  return_idx: [1, 2, 3]
+  freeze_at: -1 # 0 default
+  freeze_stem_only: True
+  freeze_norm: False # True default
+  pretrained: True
+  local_model_dir: ./weight/hgnetv2/
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  # New
+  version: deim
+  csp_type: csp2
+  fuse_op: sum
+
+DEIMTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  eval_idx: -1
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+
+  reg_max: 32
+  reg_scale: 4
+  layer_scale: 1 # 2
+
+  num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+  # Act
+  activation: silu
+  mlp_act: silu
+
+  # FFN
+  dim_feedforward: 2048
+
+PostProcessor:
+  num_top_queries: 300
+
+
+## DEIM LR-Scheduler
+epoches: 58 # 72 + 2n # Increase to search for the optimal ema
+
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      # Mosaic options
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    # Mixup options
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+    # CopyBlend options
+    copyblend_prob: 0.5
+    copyblend_epochs: [4, 50]
+    area_threshold: 100
+    num_objects: 3
+    with_expand: True
+    expand_ratios: [0.1, 0.25]
+
+    ema_restart_decay: 0.9999
+    base_size_repeat: 4
+
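+# CopyBlend sketch (assumed semantics, inferred only from the key names above):
+# with probability copyblend_prob, up to num_objects instances whose box area
+# exceeds area_threshold pixels are pasted into the image between epochs 4 and 50;
+# with_expand/expand_ratios are assumed to enlarge each pasted crop by 10-25%
+# so blended object borders keep some surrounding context.
+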
+## DEIM Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['mal', 'boxes', 'local']
+  gamma: 1.5
+  alpha: 0.75
+  reg_max: 32
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
+    # change matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 45
\ No newline at end of file
diff --git a/configs/base/dfine_hgnetv2.yml b/configs/base/dfine_hgnetv2.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e9de6d1b10341f1e5f3d525213aacb48a19e5aaf
--- /dev/null
+++ b/configs/base/dfine_hgnetv2.yml
@@ -0,0 +1,90 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 4 # save freq
+
+DEIM:
+  backbone: HGNetv2
+  encoder: HybridEncoder
+  decoder: DFINETransformer
+
+# Add, default for step lr scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 500
+flat_epoch: 4000000
+no_aug_epoch: 0
+
+HGNetv2:
+  pretrained: True
+  local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+
+DFINETransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  eval_idx: -1
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+
+  # NEW
+  reg_max: 32
+  reg_scale: 4
+
+  # Auxiliary decoder layers dimension scaling
+  # e.g., if num_layers: 6 and eval_idx: -4,
+  # then layers 3, 4, 5 are auxiliary decoder layers.
+  layer_scale: 1 # 2
+
+
+  num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+
+PostProcessor:
+  num_top_queries: 300
+
+
+DEIMCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['vfl', 'boxes', 'local']
+  alpha: 0.75
+  gamma: 2.0
+  reg_max: 32
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
\ No newline at end of file
diff --git a/configs/base/optimizer.yml b/configs/base/optimizer.yml
new file mode 100644
index 0000000000000000000000000000000000000000..db490088f0220b72309f1d7a4ab1ca6aafb45322
--- /dev/null
+++ b/configs/base/optimizer.yml
@@ -0,0 +1,35 @@
+use_amp: True
+use_ema: True
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 1000
+  start: 0
+
+epoches: 72
+clip_max_norm: 0.1
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.0000125
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
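+  # How the `params` regexes above are assumed to partition the model
+  # (illustrative names; the actual parameter names depend on the model):
+  #   backbone.stages.0.conv.weight   -> backbone group, lr 0.0000125
+  #   encoder.layers.0.norm.weight    -> encoder/decoder norm/bn group, weight_decay 0
+  #   decoder.layers.0.linear1.weight -> default group, lr 0.00025, weight_decay 0.000125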
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [500]
+  gamma: 0.1
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 500
diff --git a/configs/base/rt_deim.yml b/configs/base/rt_deim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d195ce9ea09165289e4c72301ea14b7ac45971ce
--- /dev/null
+++ b/configs/base/rt_deim.yml
@@ -0,0 +1,49 @@
+# Dense O2O
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+
+# Unfreezing BN
+PResNet:
+  freeze_at: -1 # default 0
+  freeze_norm: False # default True
+
+# Activation
+RTDETRTransformerv2:
+  query_pos_method: as_reg
+  activation: silu
+  mlp_act: silu
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Our Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2}
+  losses: ['mal', 'boxes', ]
+  gamma: 1.5
\ No newline at end of file
diff --git a/configs/base/rt_optimizer.yml b/configs/base/rt_optimizer.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0dbada062e6325083f3e79991b1965d2c0fd7901
--- /dev/null
+++ b/configs/base/rt_optimizer.yml
@@ -0,0 +1,37 @@
+use_amp: True
+use_ema: True
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+  start: 0
+
+epoches: 72
+clip_max_norm: 0.1
+
+train_dataloader:
+  total_batch_size: 16
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 2000
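+# LinearWarmup sketch (assumed behavior): the lr ramps linearly from ~0 to each
+# group's target over warmup_duration optimizer steps, e.g. at step 1000 of 2000
+# the default group runs at roughly 0.5 * 0.0001 = 0.00005; MultiStepLR takes over after.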
diff --git a/configs/base/rtdetrv2_r50vd.yml b/configs/base/rtdetrv2_r50vd.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e00de349bc6ef8d1a09a19634dc6e9d0c6b4ac41
--- /dev/null
+++ b/configs/base/rtdetrv2_r50vd.yml
@@ -0,0 +1,90 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 4 # save freq
+
+DEIM:
+  backbone: PResNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformerv2
+
+
+# Add, default for step lr scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 2000
+flat_epoch: 4000000
+no_aug_epoch: 0
+
+PResNet:
+  depth: 50
+  variant: d
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+  freeze_norm: True
+  pretrained: True
+  local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/
+
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+  version: rt_detrv2 # pay attention to this
+
+
+RTDETRTransformerv2:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0 # 1.0 0.4
+
+  eval_idx: -1
+
+  # NEW, can be chosen
+  num_points: [4, 4, 4] # [3,3,3] [2,2,2]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+
+PostProcessor:
+  num_top_queries: 300
+
+DEIMCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+  use_uni_set: False
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
\ No newline at end of file
diff --git a/configs/coco_detection.yml b/configs/dataset/coco_detection.yml
similarity index 100%
rename from configs/coco_detection.yml
rename to configs/dataset/coco_detection.yml
diff --git a/configs/dataset/crowdhuman_detection.yml b/configs/dataset/crowdhuman_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f4dc707db67f651533671034dea92144083f74f9
--- /dev/null
+++ b/configs/dataset/crowdhuman_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 2 # your dataset classes
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train
+    ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val
+    ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
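+# Note on `ops: ~` above: `~` is YAML null, so these dataset configs define no
+# transforms themselves; the actual train/val pipelines are assumed to come from
+# the base dataloader config when the files are merged via `__include__`.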
diff --git a/configs/dataset/custom_detection.yml b/configs/dataset/custom_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..35435ad68e29d99d8f9f69100cd56a2c403fe710
--- /dev/null
+++ b/configs/dataset/custom_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 777 # your dataset classes
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /data/yourdataset/train
+    ann_file: /data/yourdataset/train/train.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /data/yourdataset/val
+    ann_file: /data/yourdataset/val/val.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/configs/dataset/obj365_detection.yml b/configs/dataset/obj365_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e843e85bf2d53de3e61fdd109cf51ab9fc9957e3
--- /dev/null
+++ b/configs/dataset/obj365_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 366
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /home/Dataset/objects365/train
+    ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /home/Dataset/objects365/val
+    ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/configs/dataset/voc_detection.yml b/configs/dataset/voc_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1f9ceeb8881653d496ac5fd02c465aea5306d72f
--- /dev/null
+++ b/configs/dataset/voc_detection.yml
@@ -0,0 +1,40 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 20
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: trainval.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: test.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
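+# Hypothetical adaptation of custom_detection.yml for a 3-class COCO-format dataset
+# (illustrative values only; paths and class count are placeholders):
+#   num_classes: 3
+#   train_dataloader:
+#     dataset:
+#       img_folder: /data/mydataset/train
+#       ann_file: /data/mydataset/annotations/train.json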
diff --git a/configs/deim_dfine/deim_hgnetv2_l_coco.yml b/configs/deim_dfine/deim_hgnetv2_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b35a78e453d52d15291fd91dd04c9a55cfec8af
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_l_coco.yml
@@ -0,0 +1,37 @@
+__include__: [
+  './dfine_hgnetv2_l_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_l_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 58 # 72 + 2n
+
+## Our LR-Scheduler
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_m_coco.yml b/configs/deim_dfine/deim_hgnetv2_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9fa5167620c57f6fdb14892dd1cf9a00839fde92
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_m_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './dfine_hgnetv2_m_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_m_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*bn).*$'
+      lr: 0.00004
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0004
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 102 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 49, 90] # list
+
+  collate_fn:
+    mixup_epochs: [4, 49]
+    stop_epoch: 90
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_n_coco.yml b/configs/deim_dfine/deim_hgnetv2_n_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..62db245d96e4cb402e774efe9cb26fc1f401e40e
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_n_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  './dfine_hgnetv2_n_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./deim_outputs/deim_hgnetv2_n_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0004
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0004
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0008
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# Increase to search for the optimal ema
+epoches: 160 # 148 + 12
+
+## Our LR-Scheduler
+flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+lr_gamma: 1.0
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 78, 148] # list
+
+  collate_fn:
+    mixup_epochs: [4, 78]
+    stop_epoch: 148
+    base_size_repeat: ~
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_s_coco.yml b/configs/deim_dfine/deim_hgnetv2_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..68ea99aae9926cefb33b3380d37a9cbd70ed28eb
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_s_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './dfine_hgnetv2_s_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_s_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*bn).*$'
+      lr: 0.0002
+    -
+      params: '^(?=.*(?:norm|bn)).*$' # except bias
+      weight_decay: 0.
+
+  lr: 0.0004
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 120] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 120
\ No newline at end of file
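+# Assumed `__include__` semantics for the deim_hgnetv2_* files above: the listed
+# configs are loaded first and merged key by key, so a minimal variant could be:
+#   __include__: ['./dfine_hgnetv2_l_coco.yml', '../base/deim.yml']
+#   epoches: 58          # overrides the included epoch count
+#   optimizer: {...}     # replaces the base param groups wholesale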
diff --git a/configs/deim_dfine/deim_hgnetv2_x_coco.yml b/configs/deim_dfine/deim_hgnetv2_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8ec7f1b611089c6b8aa3a974c980f862cf821679
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_x_coco.yml
@@ -0,0 +1,37 @@
+__include__: [
+  './dfine_hgnetv2_x_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_x_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000005
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 58 # 72 + 2n
+
+## Our LR-Scheduler
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
\ No newline at end of file
diff --git a/configs/deim_dfine/dfine_hgnetv2_l_coco.yml b/configs/deim_dfine/dfine_hgnetv2_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..16c6002434659b5918db2f08955f054ca9c83d81
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_l_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./outputs/dfine_hgnetv2_l_coco
+
+
+HGNetv2:
+  name: 'B4'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000125
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+# Increase to search for the optimal ema
+epoches: 80 # 72 + 2n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 72
+  collate_fn:
+    stop_epoch: 72
+    ema_restart_decay: 0.9999
+    base_size_repeat: 4
diff --git a/configs/deim_dfine/dfine_hgnetv2_m_coco.yml b/configs/deim_dfine/dfine_hgnetv2_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b5f917bd5723b931bb09e0389eb49e2e15af8c8
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_m_coco.yml
@@ -0,0 +1,60 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_m_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B2'
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+DFINETransformer:
+  num_layers: 4 # 5 6
+  eval_idx: -1 # -2 -3
+
+HybridEncoder:
+  in_channels: [384, 768, 1536]
+  hidden_dim: 256
+  depth_mult: 0.67
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00002
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 120
+  collate_fn:
+    stop_epoch: 120
+    ema_restart_decay: 0.9999
+    base_size_repeat: 6
diff --git a/configs/deim_dfine/dfine_hgnetv2_n_coco.yml b/configs/deim_dfine/dfine_hgnetv2_n_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c45e65357cbfe547851e8b7385be67ca7226f6cd
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_n_coco.yml
@@ -0,0 +1,82 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_n_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B0'
+  return_idx: [2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+
+HybridEncoder:
+  in_channels: [512, 1024]
+  feat_strides: [16, 32]
+
+  # intra
+  hidden_dim: 128
+  use_encoder_idx: [1]
+  dim_feedforward: 512
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+
+
+DFINETransformer:
+  feat_channels: [128, 128]
+  feat_strides: [16, 32]
+  hidden_dim: 128
+  dim_feedforward: 512
+  num_levels: 2
+
+  num_layers: 3
+  eval_idx: -1
+
+  num_points: [6, 6]
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0004
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0004
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0008
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 160 # 148 + 4n
+train_dataloader:
+  total_batch_size: 128
+  dataset:
+    transforms:
+      policy:
+        epoch: 148
+  collate_fn:
+    stop_epoch: 148
+    ema_restart_decay: 0.9999
+    base_size_repeat: ~
+
+val_dataloader:
+  total_batch_size: 256
diff --git a/configs/deim_dfine/dfine_hgnetv2_s_coco.yml b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..33857bc47fe0311586a28b16f7c2d58af909ffbc
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml
@@ -0,0 +1,61 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_s_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B0'
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+DFINETransformer:
+  num_layers: 3 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+HybridEncoder:
+  in_channels: [256, 512, 1024]
+  hidden_dim: 256
+  depth_mult: 0.34
+  expansion: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0001
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0001
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 120
+  collate_fn:
+    stop_epoch: 120
+    ema_restart_decay: 0.9999
+    base_size_repeat: 20
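+# Scaling note (assumed convention shared by these variants): depth_mult is taken
+# to scale the number of repeated CSP blocks in the encoder and expansion their
+# hidden width, so the S model (depth_mult 0.34, expansion 0.5) keeps the L
+# topology at roughly a third of the depth and half of the width.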
diff --git a/configs/deim_dfine/dfine_hgnetv2_x_coco.yml b/configs/deim_dfine/dfine_hgnetv2_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..46ec15753906b37f9d0f61bf8e0637c77c295251
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_x_coco.yml
@@ -0,0 +1,56 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_x_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B5'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+DFINETransformer:
+  feat_channels: [384, 384, 384]
+  reg_scale: 8
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+# Increase to search for the optimal ema
+epoches: 80 # 72 + 2n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 72
+  collate_fn:
+    stop_epoch: 72
+    ema_restart_decay: 0.9998
+    base_size_repeat: 3
diff --git a/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml b/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml
new file mode 100644
index 0000000000000000000000000000000000000000..28fcd4c12f2ab91bffb111e33b18b612a9bda12d
--- /dev/null
+++ b/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml
@@ -0,0 +1,50 @@
+__include__: [
+  './dfine_hgnetv2_x_obj2coco.yml',
+  '../../base/deim.yml'
+]
+
+output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e
+
+HGNetv2:
+  freeze_at: 0 # 0 default
+  freeze_norm: True # True default
+
+# Activation
+DFINETransformer:
+  activation: relu
+  mlp_act: relu
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 24 # 72 + 2n
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 0 # 0
+flat_epoch: 12000 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 4
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [2, 12, 20] # list
+
+  collate_fn:
+    mixup_epochs: [2, 12]
+    stop_epoch: 20
\ No newline at end of file
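+# Fine-tuning recipe as configured above (stated for orientation): the Objects365
+# checkpoint is adapted to COCO for only 24 epochs, with the stem and norm layers
+# kept frozen (freeze_at: 0, freeze_norm: True), relu activations, no LR warmup
+# (warmup_iter: 0), and a backbone LR (2.5e-6) 100x smaller than the 2.5e-4 base LR.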
diff --git a/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml b/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c9c711b678e953d7589ad106c2aef064bd906ae6
--- /dev/null
+++ b/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml
@@ -0,0 +1,57 @@
+__include__: [
+  '../../dataset/coco_detection.yml',
+  '../../runtime.yml',
+  '../../base/dataloader.yml',
+  '../../base/optimizer.yml',
+  '../../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./outputs/dfine_hgnetv2_x_obj2coco
+
+HGNetv2:
+  name: 'B5'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+DFINETransformer:
+  feat_channels: [384, 384, 384]
+  reg_scale: 8
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+epoches: 36 # Early stop
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 30
+  collate_fn:
+    stop_epoch: 30
+    ema_restart_decay: 0.9999
+    base_size_repeat: 3
+
+ema:
+  warmups: 0
+
+lr_warmup_scheduler:
+  warmup_duration: 0
diff --git a/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml b/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..538a9afb8b9e1187d3ebf2cccd963d6eec35fd70
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml
@@ -0,0 +1,36 @@
+__include__: [
+  './rtdetrv2_r101vd_6x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.000002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
diff --git a/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml b/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2b2069fa727453a3f34f9a67c5b9f59ffce7773d
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml
@@ -0,0 +1,32 @@
+__include__: [
+  './rtdetrv2_r18vd_120e_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 120
+flat_epoch: 64 # 4 + 120 / 2
+no_aug_epoch: 3
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 117] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 117
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml b/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fb9d23f72a49df713c5d0c4e91ed5e1b95d57e73
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml
@@ -0,0 +1,36 @@
+__include__: [
+  './rtdetrv2_r34vd_120e_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.0001
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# change part
+epoches: 120
+flat_epoch: 64
+no_aug_epoch: 3
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 117] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 117
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml b/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7427c57248f0f2740512fa65b6e5642d2da99709
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml
@@ -0,0 +1,35 @@
+__include__: [
+  './rtdetrv2_r50vd_6x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml b/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b08bdc3713cb32fc23fc95ea9fa37f94fc7feb43
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './rtdetrv2_r50vd_m_7x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for eval
+  num_layers: 3
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
\ No newline at end of file
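+# eval_idx note (assumed indexing): decoder layers are 0-based, so with
+# num_layers: 3 the `eval_idx: 2` above selects the last (3rd) layer, which is
+# equivalent to the `eval_idx: -1` convention used elsewhere in these configs.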
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..256a089b2886fc1830923e957732b7a07bc4273b
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
@@ -0,0 +1,40 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r101vd_6x_coco
+
+
+PResNet:
+  depth: 101
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformerv2:
+  feat_channels: [384, 384, 384]
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..04d4d533da9a0bcd6f64e9eb2ddbcdd76ddf8edf
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+RTDETRTransformerv2:
+  num_layers: 3
+
+
+epoches: 120
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9cfb522a1870818e2411d863b40f5f35b2289b12
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
@@ -0,0 +1,57 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r34vd_120e_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 4
+
+
+epoches: 120
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00005
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00005
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    stop_epoch: 117
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3ffe8505be0e15b5f22f088d9215da86fdabc970
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
@@ -0,0 +1,25 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r50vd_6x_coco
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..760a3866760fd487a4788a9d9efa249e5b65d6a4
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
@@ -0,0 +1,43 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco
+
+
+HybridEncoder:
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for eval
+
+
+epoches: 84
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 81
+  collate_fn:
+    stop_epoch: 81
\ No newline at end of file
diff --git a/configs/deimv2/deimv2_dinov3_l_coco.yml b/configs/deimv2/deimv2_dinov3_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7bb97cb7bbcad41c59ae8d9f01994862296d79b5
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_l_coco.yml
@@ -0,0 +1,104 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+
+output_dir: ./outputs/deimv2_dinov3_l_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: dinov3_vits16
+  weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth
+  interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
+  finetune: True
+  conv_inplane: 32
+  hidden_dim: 224
+
+HybridEncoder:
+  in_channels: [224, 224, 224]
+  hidden_dim: 224
+  dim_feedforward: 896
+
+DEIMTransformer:
+  feat_channels: [224, 224, 224]
+  hidden_dim: 224
+  num_layers: 4
+  eval_idx: -1
+  dim_feedforward: 1792
+
+## DEIM LR-Scheduler
+epoches: 68 # 72 + 2n # Increase to search for the optimal ema
+
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Optimizer
+optimizer:
+  type: AdamW
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.0000125
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.0000125
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 34, 60] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 60
+    copyblend_epochs: [4, 60]
+    base_size_repeat: 3
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+## DEIM Loss
+DEIMCriterion:
+  matcher:
+    matcher_change_epoch: 50
\ No newline at end of file
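+# Note: unlike the HGNetv2 pipelines, the DINOv3 configs add a Normalize op with
+# ImageNet mean/std to both the train and val transforms, which the ViT backbone
+# is assumed to expect; keep it in sync between the two pipelines.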
diff --git a/configs/deimv2/deimv2_dinov3_m_coco.yml b/configs/deimv2/deimv2_dinov3_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c4b17334138bad9e8bd431e858b50abc6c2514b3
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_m_coco.yml
@@ -0,0 +1,107 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_dinov3_m_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: vit_tinyplus
+  embed_dim: 256
+  weights_path: ./ckpts/vittplus_distill.pt
+  interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
+  num_heads: 4
+
+HybridEncoder:
+  in_channels: [256, 256, 256]
+  depth_mult: 1
+  expansion: 0.67
+  hidden_dim: 256
+  dim_feedforward: 512
+
+
+DEIMTransformer:
+  feat_channels: [256, 256, 256]
+  hidden_dim: 256
+  dim_feedforward: 512
+  num_layers: 4 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+optimizer:
+  type: AdamW
+
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+epoches: 102 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 49, 90] # list
+
+  collate_fn:
+    mixup_prob: 0.5
+    ema_restart_decay: 0.9999
+    base_size_repeat: 6
+    mixup_epochs: [4, 49]
+    stop_epoch: 90
+    copyblend_epochs: [4, 90]
+
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+DEIMCriterion:
+  matcher:
+    # new matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 80
diff --git a/configs/deimv2/deimv2_dinov3_s_coco.yml b/configs/deimv2/deimv2_dinov3_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c8ec7ea3b95902fcd88592cbd5ce478a09b788c0
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_s_coco.yml
@@ -0,0 +1,108 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_dinov3_s_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: vit_tiny
+  embed_dim: 192
+  weights_path: ./ckpts/vitt_distill.pt
+  interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
+  num_heads: 3
+
+HybridEncoder:
+  in_channels: [192, 192, 192]
+  depth_mult: 0.67
+  expansion: 0.34
+  hidden_dim: 192
+  dim_feedforward: 512
+
+DEIMTransformer:
+  feat_channels: [192, 192, 192]
+  hidden_dim: 192
+  dim_feedforward: 512
+  num_layers: 4 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+
+## Optimizer
+optimizer:
+  type: AdamW
+
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+    -
+      # including all norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+      weight_decay: 0.
+    -
+      # including all norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 64, 120] # list
+
+  collate_fn:
+    base_size: 640
+    mixup_prob: 0.5
+    ema_restart_decay: 0.9999
+    base_size_repeat: 20
+    mixup_epochs: [4, 64]
+    stop_epoch: 120
+    copyblend_epochs: [4, 120]
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+DEIMCriterion:
+  matcher:
+    # change matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 100
diff --git a/configs/deimv2/deimv2_dinov3_x_coco.yml b/configs/deimv2/deimv2_dinov3_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f85120b4905dc25318f564d749fd2ba789811fed
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_x_coco.yml
@@ -0,0 +1,94 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+
+output_dir: ./outputs/deimv2_dinov3_x_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: dinov3_vits16plus
+  weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth
+  interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
+  finetune: True
+  conv_inplane: 64
+  hidden_dim: 256
+
+HybridEncoder:
+  in_channels: [256, 256, 256]
+  # intra
+  hidden_dim: 256
+  dim_feedforward: 1024
+
+  # cross
+  expansion: 1.25
+  depth_mult: 1.37
+
+DEIMTransformer:
+  num_layers: 6
+  eval_idx: -1
+  feat_channels: [256, 256, 256]
+  # reg_scale: 8
+  hidden_dim: 256
+  dim_feedforward: 2048
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.00001
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.00001
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
+    copyblend_epochs: [4, 50]
+    base_size_repeat: 3
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
\ No newline at end of file
diff --git a/configs/deimv2/deimv2_hgnetv2_atto_coco.yml b/configs/deimv2/deimv2_hgnetv2_atto_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4d770494fd5b99bb50199ae809920c57a97d98a7
--- /dev/null
+++ b/configs/deimv2/deimv2_hgnetv2_atto_coco.yml
@@ -0,0 +1,123 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_hgnetv2_atto_coco
+
+DEIM:
+  encoder: LiteEncoder
+
+HGNetv2:
+  name: 'Atto'
+  return_idx: [2]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+LiteEncoder:
+  in_channels: [256]
+  feat_strides: [16]
+  # intra
+  hidden_dim: 64
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+  act: 'silu'
+
+
+DEIMTransformer:
+  feat_channels: [64, 64]
+  feat_strides: [16, 32]
+  hidden_dim: 64
+  num_levels: 2
+  num_points: [4, 2]
+
+  num_layers: 3
+  eval_idx: -1
+  num_queries: 100
+
+  # FFN
+  dim_feedforward: 160
+
+  # New options for DEIMv2
+  share_bbox_head: True
+  use_gateway: False
+
+# Increase to search for the optimal ema
+epoches: 500 # 468 + 32
+
+## Our LR-Scheduler
+warmup_iter: 4000
+flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 32
+lr_gamma: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    - params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.001
+    - params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.001
+      weight_decay: 0.
+    - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias
+      weight_decay: 0.
+
+  lr: 0.002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+eval_spatial_size: [320, 320]
+train_dataloader:
+  total_batch_size: 128
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 12}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [320, 320], }
+        - {type: SanitizeBoundingBoxes, min_size: 12}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 250, 400] # list
+        mosaic_prob: 0.3
+
+  collate_fn:
+    mixup_prob: 0.0
+    mixup_epochs: [40000, 15000]
+    copyblend_prob: 0.0
+    copyblend_epochs: [40000, 15000]
+
+    stop_epoch: 468 # 468 + 32
+    ema_restart_decay: 0.9999
+    base_size: 320
+    base_size_repeat: ~
+
+val_dataloader:
+  total_batch_size: 256
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [320, 320], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+  shuffle: False
+  num_workers: 16
+
+
+DEIMCriterion:
+  losses: ['mal', 'boxes'] # , 'local'
+  use_uni_set: False
+
+  matcher:
+    matcher_change_epoch: 450 # FIX This
\ No newline at end of file
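+# As configured, the Atto recipe disables MixUp and CopyBlend outright: their
+# probabilities are 0.0 and their epoch windows ([40000, 15000]) lie far beyond
+# the 500-epoch schedule, so only Mosaic (mosaic_prob: 0.3) remains active.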
diff --git a/configs/deimv2/deimv2_hgnetv2_femto_coco.yml b/configs/deimv2/deimv2_hgnetv2_femto_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7a9a2952da71aa0d01a5fd5cc28b57b346f49837
--- /dev/null
+++ b/configs/deimv2/deimv2_hgnetv2_femto_coco.yml
@@ -0,0 +1,128 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_hgnetv2_femto_coco
+
+DEIM:
+  encoder: LiteEncoder
+
+HGNetv2:
+  name: 'Femto'
+  return_idx: [2]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+LiteEncoder:
+  in_channels: [512]
+  feat_strides: [16]
+
+  # intra
+  hidden_dim: 96
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+  act: 'silu'
+
+
+DEIMTransformer:
+  feat_channels: [96, 96]
+  feat_strides: [16, 32]
+  hidden_dim: 96
+  num_levels: 2
+  num_points: [4, 2]
+
+  num_layers: 3
+  eval_idx: -1
+  num_queries: 150
+
+  # FFN
+  dim_feedforward: 256
+
+  # New options for DEIMv2
+  share_bbox_head: True
+  use_gateway: False
+
+# Increase to search for the optimal ema
+epoches: 500 # 468 + 32
+
+## Our LR-Scheduler
+warmup_iter: 4000
+flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 32
+lr_gamma: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0008
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0008
+      weight_decay: 0.
+    - # not opt
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+ + lr: 0.0016 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +eval_spatial_size: [416, 416] +train_dataloader: + total_batch_size: 128 + dataset: + transforms: + ops: + - {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], + probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 10} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [416, 416], } + - {type: SanitizeBoundingBoxes, min_size: 10} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + epoch: [4, 250, 400] # list + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + mosaic_prob: 0.5 + + collate_fn: + mixup_prob: 0.0 + mixup_epochs: [40000, 15000] + copyblend_prob: 0.0 + copyblend_epochs: [40000, 15000] + + stop_epoch: 468 # 468 + 32 + ema_restart_decay: 0.9999 + base_size: 416 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 256 + dataset: + transforms: + ops: + - {type: Resize, size: [416, 416], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + num_workers: 16 + + +DEIMCriterion: + losses: ['mal', 'boxes'] # , 'local' + use_uni_set: False + + matcher: + matcher_change_epoch: 450 # FIX This \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_l_coco.yml b/configs/deimv2/deimv2_hgnetv2_l_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..0d94babe4b29ed306c1e1f5b1ab5352379242799 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_l_coco.yml @@ -0,0 +1,24 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_l_coco + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000025 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0005 + betas: [0.9, 0.999] + weight_decay: 0.000125 diff --git a/configs/deimv2/deimv2_hgnetv2_m_coco.yml b/configs/deimv2/deimv2_hgnetv2_m_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..d95fcf3bb45c04ab8ca1cd199a7ce47b2a95f683 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_m_coco.yml @@ -0,0 +1,72 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_m_coco + +HGNetv2: + name: 'B2' + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [384, 768, 1536] + hidden_dim: 256 + depth_mult: 0.67 + +DEIMTransformer: + num_layers: 4 # 5 6 + eval_idx: -1 # -2 -3 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.00004 + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
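+ # Backbone weights (the '^(?=.*backbone)...' group above) train at one tenth of the global lr below (0.00004 vs 0.0004).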
+ + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 102 # 90 + 12 + +## Our LR-Scheduler +flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 49, 90] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: 6 + mixup_epochs: [4, 49] + stop_epoch: 90 + copyblend_prob: 0.5 + copyblend_epochs: [4, 90] + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 80 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_n_coco.yml b/configs/deimv2/deimv2_hgnetv2_n_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..00ceea488b1eeec50204fb3464c4c725fafcc9fe --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_n_coco.yml @@ -0,0 +1,96 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_n_coco + +HGNetv2: + name: 'B0' + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + + version: 'dfine' + +DEIMTransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + num_levels: 2 + num_points: [6, 6] + + num_layers: 3 + eval_idx: -1 + + # FFN + dim_feedforward: 512 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0.
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: ~ + mixup_epochs: [4, 78] + stop_epoch: 148 + copyblend_prob: 0.4 + copyblend_epochs: [4, 78] # CP half + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 136 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_pico_coco.yml b/configs/deimv2/deimv2_hgnetv2_pico_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..71c29f65afd4db99b57d291de5789ad7d7b63240 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_pico_coco.yml @@ -0,0 +1,128 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml', +] + +output_dir: ./outputs/deimv2_hgnetv2_pico_coco + +DEIM: + encoder: LiteEncoder + decoder: DEIMTransformer + +HGNetv2: + name: 'Pico' + return_idx: [2] + freeze_at: -1 + freeze_norm: False + use_lab: True + +LiteEncoder: + in_channels: [512] + feat_strides: [16] + + # intra + hidden_dim: 112 + + # cross + expansion: 0.34 + depth_mult: 0.5 + act: 'silu' + + +DEIMTransformer: + feat_channels: [112, 112] + feat_strides: [16, 32] + hidden_dim: 112 + num_levels: 2 + num_points: [4, 2] + + num_layers: 3 + eval_idx: -1 + num_queries: 200 + + # FFN + dim_feedforward: 320 + + # New options for DEIMv2 + share_bbox_head: True + use_gateway: False + +# Increase to search for the optimal ema +epoches: 500 # 468 + 32 + +## Our LR-Scheduler +warmup_iter: 4000 +flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 32 +lr_gamma: 0.5 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0008 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0008 + weight_decay: 0. + - # not opt + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0016 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +eval_spatial_size: [640, 640] +train_dataloader: + total_batch_size: 128 + dataset: + transforms: + ops: + - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], + probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 8} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 8} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + epoch: [4, 250, 400] # list + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + mosaic_prob: 0.5 + + collate_fn: + mixup_prob: 0.0 + mixup_epochs: [40000, 15000] + copyblend_prob: 0.0 + copyblend_epochs: [40000, 15000] + stop_epoch: 468 # 468 + 32 + ema_restart_decay: 0.9999 + base_size: 640 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 256 + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + num_workers: 16 + + +DEIMCriterion: + losses: ['mal', 'boxes'] # , 'local' + use_uni_set: False + + matcher: + matcher_change_epoch: 450 # FIX This \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_s_coco.yml b/configs/deimv2/deimv2_hgnetv2_s_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..b543760f9f759823f81a7abbb6f93858e584aa87 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_s_coco.yml @@ -0,0 +1,76 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_s_coco + +HGNetv2: + name: 'B0' + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [256, 512, 1024] + hidden_dim: 256 + depth_mult: 0.34 + expansion: 0.5 + + version: 'dfine' + +DEIMTransformer: + num_layers: 3 # 4 5 6 + eval_idx: -1 # -2 -3 -4 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.0002 + - + params: '^(?=.*(?:norm|bn)).*$' # except bias + weight_decay: 0. 
+ + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 132 # 120 + 4n + +## Our LR-Scheduler +flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 64, 120] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: 20 + mixup_epochs: [4, 64] + stop_epoch: 120 + copyblend_prob: 0.5 + # copyblend_epochs: [4, 64] # from v11 to v12: copy-paste continues only half epochs + copyblend_epochs: [4, 120] + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 100 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_x_coco.yml b/configs/deimv2/deimv2_hgnetv2_x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..0355d6e314a4ef127b1c3bb25d2978a8cbedb4a5 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_x_coco.yml @@ -0,0 +1,60 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_x_coco + + +HGNetv2: + name: 'B5' + return_idx: [1, 2, 3] + freeze_stem_only: True + freeze_at: 0 + freeze_norm: True + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + +DEIMTransformer: + feat_channels: [384, 384, 384] # [256, 256, 256] + reg_scale: 8 # 4 + + # FFN + dim_feedforward: 2048 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000005 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
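+ # With freeze_at: 0 and freeze_norm: True above, the remaining B5 backbone weights + # are fine-tuned at 1/100 of the global lr below (0.000005 vs 0.0005).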
+ + lr: 0.0005 + betas: [0.9, 0.999] + weight_decay: 0.000125 + +# Increase to search for the optimal ema +epoches: 58 # 50 + 8 + +## Our LR-Scheduler +flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 8 + +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 29, 50] # list + + collate_fn: + ema_restart_decay: 0.9998 + base_size_repeat: 3 diff --git a/configs/deimv2_floorplan.yaml b/configs/deimv2_floorplan.yaml deleted file mode 100644 index 458545930512c119201f0ed7924b4009783e9383..0000000000000000000000000000000000000000 --- a/configs/deimv2_floorplan.yaml +++ /dev/null @@ -1,189 +0,0 @@ -__include__: [ - 'coco_detection.yml', # in the same directory - # '../configs/runtime.yml', # commented out because the file may not exist - # '../configs/base/dataloader.yml', # commented out because the file may not exist - # '../configs/base/optimizer.yml', - # '../configs/base/deimv2.yml', # commented out because the file may not exist -] - -output_dir: ./outputs/deimv2_floorplan - -# Model definition (referenced by engine/core.py) -model: - type: DEIM - backbone: - type: DINOv3STAs - name: vit_tiny - weights_path: ./ckpts/vitt_distill.pt - interaction_indexes: [3, 7, 11] - num_heads: 3 - embed_dim: 192 - encoder: - type: HybridEncoder - in_channels: [192, 192, 192] - depth_mult: 0.67 - expansion: 0.34 - hidden_dim: 192 - dim_feedforward: 512 - decoder: - type: DEIMTransformer - feat_channels: [192, 192, 192] - hidden_dim: 192 - dim_feedforward: 512 - num_layers: 4 # 4 5 6 - eval_idx: -1 # -2 -3 -4 - -# Postprocessor definition (referenced by engine/core.py) -postprocessor: - type: PostProcessor - -# Kept for compatibility (use as needed) -DEIM: - backbone: DINOv3STAs - -Model: - num_classes: 16 - class_names: ["kanki", "kanki_shikaku", "kanki_regisuta", "window1", "window2", "door1", "door2", "bathtub1", "konro1", "sink1", "toilet1", "kasaikeihou1", "kasaikeihou2", "houi1", "houi2", "houi3"] - -# Set eval_spatial_size explicitly (image size at inference) -eval_spatial_size: [640, 640] - -DINOv3STAs: - name: vit_tiny - embed_dim: 192 - weights_path: ./ckpts/vitt_distill.pt # delete this line to train without pretrained weights - interaction_indexes: [3, 7, 11] - num_heads: 3 - -HybridEncoder: - in_channels: [192, 192, 192] - depth_mult: 0.67 - expansion: 0.34 - hidden_dim: 192 - dim_feedforward: 512 - -DEIMTransformer: - feat_channels: [192, 192, 192] - hidden_dim: 192 - dim_feedforward: 512 - num_layers: 4 # 4 5 6 - eval_idx: -1 # -2 -3 -4 - - -## Optimizer -optimizer: - type: AdamW - - params: - - - # except norm/bn/bias in self.dinov3 - params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' - lr: 0.000025 - - - # including all norm/bn/bias in self.dinov3 - params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' - lr: 0.000025 - weight_decay: 0. - - - # including all norm/bn/bias except for the self.dinov3 - params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' - weight_decay: 0.
- - lr: 0.0005 - betas: [0.9, 0.999] - weight_decay: 0.0001 - -epoches: 400 -flat_epoch: 196 -no_aug_epoch: 46 - -# Settings needed from optimizer.yml, added manually -use_amp: True -use_ema: True -ema: - type: ModelEMA - decay: 0.9999 - warmups: 1000 - start: 0 - -clip_max_norm: 0.1 -sync_bn: True -find_unused_parameters: True - -# Learning-rate scheduling settings -# CosineAnnealingLR-specific settings (parameters kept minimal) -lr_scheduler: - type: CosineAnnealingLR - T_max: 400 - eta_min: 0.0000001 - -lr_warmup_scheduler: - type: LinearWarmup - warmup_duration: 1000 - -# Disable the existing flatcosine scheduler -lrsheduler: null - -# Also disable the flatcosine scheduler from deimv2.yml -lr_gamma: null -warmup_iter: null -flat_epoch: null -no_aug_epoch: null - - -# ---- Data Aug / Loader (floor plans + 640px + OOM mitigation) ---- -train_dataloader: - dataset: - transforms: - ops: - # Mosaic uses a low probability and a narrow scale range to suppress memory peaks at 640 - - {type: Mosaic, output_size: 640, rotation_range: 8, translation_range: [0.1, 0.1], - scaling_range: [0.9, 1.1], probability: 0.2, fill_value: 0, use_cache: True, - max_cached_images: 20, random_pop: True} - - {type: RandomPhotometricDistort, p: 0.2} - - {type: RandomZoomOut, fill: 0} - - {type: RandomIoUCrop, p: 0.6} - - {type: SanitizeBoundingBoxes, min_size: 1} - - {type: RandomHorizontalFlip} - - {type: RandomRotation, degrees: [90, 180, 270, 360], p: 0.5} # enabled in the fixed version - - {type: Resize, size: [640, 640]} # ★ fixed at 640 - - {type: SanitizeBoundingBoxes, min_size: 1} - - {type: ConvertPILImage, dtype: 'float32', scale: True} - - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} - - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} - policy: - epoch: [8, 192, 352] # adjusted to match 400 epochs - - collate_fn: # prevents degradation on line drawings & limits memory - ema_restart_decay: 0.9999 - base_size_repeat: 1 # ★ set to 1 to effectively turn multi-scale off - stop_epoch: 352 # stop at about 90% of 400 epochs - copyblend_epochs: [8, 352] # adjusted to match 400 epochs - - # effective only if the implementation reads it; otherwise control via base/dataloader.yml or launch arguments - total_batch_size: 4 # ★ dropped to 4 first for stability - -val_dataloader: - dataset: - transforms: - ops: - - {type: Resize, size: [640, 640]} - - {type: ConvertPILImage, dtype: 'float32', scale: True} - - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} - total_batch_size: 6 # keep evaluation batch size comparable - -DEIMCriterion: - matcher: - change_matcher: True - iou_order_alpha: 4.0 - matcher_change_epoch: 300 - gamma: 1.5 - alpha: 0.75 - weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} - losses: [mal, boxes, local] - -# Output settings - always save the last epoch -output: - save_last: true - save_interval: 5 # checkpoint save interval - checkpoint_freq: 5 # save frequency in the training loop diff --git a/configs/runtime.yml b/configs/runtime.yml new file mode 100644 index 0000000000000000000000000000000000000000..8397ce1ff91246825e39f0530b544daaa0f891fe --- /dev/null +++ b/configs/runtime.yml @@ -0,0 +1,20 @@ +print_freq: 500 +output_dir: './logs' +checkpoint_freq: 12 + + +sync_bn: True +find_unused_parameters: True + + +use_amp: False +scaler: + type: GradScaler + enabled: True + + +use_ema: False +ema: + type: ModelEMA + decay: 0.9999 + warmups: 1000 diff --git a/engine/__init__.py b/engine/__init__.py index 7278009b9c96970c981f631bbd1702f9328d159f..69baa01f55ae4799118a52fb6290ae7a2006d87a 100644 --- a/engine/__init__.py +++ b/engine/__init__.py @@ -1,13 +1,16 @@ -# engine package -# Import modules to register them in the registry -from . import backbone -from . import deim -from . import data -from . import optim -from . import misc +""" +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" -# Export YAMLConfig -from .core.yaml_config import YAMLConfig +# for register purpose +from . import optim +from . 
import data +from . import deim -__all__ = ['YAMLConfig'] +from .backbone import * +from .backbone import ( + get_activation, + FrozenBatchNorm2d, + freeze_batch_norm2d, +) \ No newline at end of file diff --git a/engine/backbone/vit_tiny.py b/engine/backbone/vit_tiny.py index 50aa2f7b49be54f041c58582eaf99812d7af4023..e00291394d91466f41e9829c1a8b2c5f32e1e862 100644 --- a/engine/backbone/vit_tiny.py +++ b/engine/backbone/vit_tiny.py @@ -6,14 +6,16 @@ Modified from DINOv3 (https://github.com/facebookresearch/dinov3) Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py """ +import math +import warnings +from functools import partial +from typing import List, Literal, Tuple + +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from functools import partial -import math -import numpy as np -import warnings -from typing import Literal, Tuple +from torch import nn class RopePositionEmbedding(nn.Module): @@ -180,11 +182,11 @@ class Attention(nn.Module): head_dim = dim // num_heads self.scale = head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) + self.attn_drop = attn_drop self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) - def forward(self, x, rope_sincos=None, register_hook=False): + def forward(self, x, rope_sincos=None): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv.unbind(0) @@ -200,13 +202,8 @@ class Attention(nn.Module): q = torch.cat((q_cls, q_patch), dim=2) k = torch.cat((k_cls, k_patch), dim=2) - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - if register_hook: attn.register_hook(self.save_attn_gradients) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop) + x = x.transpose(1, 2).reshape([B, N, C]) x = self.proj(x) x = self.proj_drop(x) return x @@ -220,8 +217,8 @@ class Block(nn.Module): self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) - def forward(self, x, rope_sincos=None, register_hook=False): - attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos, register_hook=register_hook) + def forward(self, x, rope_sincos=None): + attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos) x = x + self.drop_path(attn_output) x = x + self.drop_path(self.mlp(self.norm2(x))) return x @@ -260,7 +257,6 @@ class VisionTransformer(nn.Module): normalize_coords="separate", shift_coords=None, jitter_coords=None, rescale_coords=None, dtype=None, device=None, ) - self.init_weights() def init_weights(self): @@ -286,28 +282,7 @@ class VisionTransformer(nn.Module): def feature_dim(self): return self.embed_dim - def forward_features(self, x, register_hook=False): - B, C, H, W = x.shape - - x_embed = self._model.patch_embed(x) - cls_token = self._model.cls_token.expand(x_embed.shape[0], -1, -1) - x = torch.cat((cls_token, x_embed), dim=1) - - patch_grid_h = H // self.patch_size - patch_grid_w = W // self.patch_size - rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w) - - for blk in self._model.blocks: - x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook) - x = x[:, 1:, :] - return {'features': x.transpose(1, 2).reshape(-1, self.embed_dim, patch_grid_h, patch_grid_w)} - - def forward_pool(self, x): - features = 
self.forward_features(x)['features'] - pooled_features = features.mean(dim=[2, 3]) - return {'pooled_features': pooled_features} - - def forward(self, x, register_hook=False): + def forward(self, x): outs = [] B, C, H, W = x.shape @@ -320,7 +295,7 @@ class VisionTransformer(nn.Module): rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w) for i, blk in enumerate(self._model.blocks): - x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook) + x = blk(x, rope_sincos=rope_sincos) if i in self.return_layers: outs.append((x[:, 1:], x[:, 0])) return outs diff --git a/engine/core/workspace.py b/engine/core/workspace.py index 1d7bd693d6d89377ed722c8fdebd1a4b94c2f789..2f9d2a146605e7b8c58c92eedbd6f5eb0981372d 100644 --- a/engine/core/workspace.py +++ b/engine/core/workspace.py @@ -6,7 +6,6 @@ Copyright(c) 2023 lyuwenyu. All Rights Reserved. import inspect import importlib import functools -import copy from collections import defaultdict from typing import Any, Dict, Optional, List @@ -14,23 +13,6 @@ from typing import Any, Dict, Optional, List GLOBAL_CONFIG = defaultdict(dict) -def _safe_copy_cfg(obj): - """ - deepcopy that leaves module objects untouched to avoid pickle errors. - """ - if isinstance(obj, dict): - copied = {} - for k, v in obj.items(): - if k == '_pymodule' or inspect.ismodule(v): - copied[k] = v - else: - copied[k] = _safe_copy_cfg(v) - return copied - if isinstance(obj, list): - return [_safe_copy_cfg(v) for v in obj] - return copy.deepcopy(obj) - - def register(dct :Any=GLOBAL_CONFIG, name=None, force=False): """ dct: @@ -110,63 +92,18 @@ def extract_schema(module: type): def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): """ - Create registered modules from string, type, or config dict. """ - cfg_override = None - if isinstance(type_or_name, dict): - assert 'type' in type_or_name, 'config dict must have `type` key.' - cfg_override = copy.deepcopy(type_or_name) - name = cfg_override.pop('type') - if isinstance(name, type): - name = name.__name__ - elif isinstance(type_or_name, (type, str)): - name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__ - else: - raise AssertionError('create should be modules, name, or config dict.') - - # Check if module is registered in GLOBAL_CONFIG first - if name not in GLOBAL_CONFIG: - raise ValueError( - f'The module {name} is not registered in GLOBAL_CONFIG. ' - f'Make sure the module is imported and registered with @register() decorator. ' - f'Available registered modules: {list(GLOBAL_CONFIG.keys())[:20]}...' - ) - + assert type(type_or_name) in (type, str), 'create should be modules or name.' 
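+ # Resolve the registry key: strings are used as-is, classes by their __name__.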
+ + name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__ + if name in global_cfg: if hasattr(global_cfg[name], '__dict__'): return global_cfg[name] - - # Get config from global_cfg if available, otherwise use GLOBAL_CONFIG - if name in global_cfg: - cfg = _safe_copy_cfg(global_cfg[name]) else: - cfg = _safe_copy_cfg(GLOBAL_CONFIG[name]) - # fallback: if merged config lost registry metadata, restore from base GLOBAL_CONFIG - # Always check and restore metadata from GLOBAL_CONFIG if missing, regardless of global_cfg - if name in GLOBAL_CONFIG: - base = _safe_copy_cfg(GLOBAL_CONFIG[name]) - # Restore all metadata fields if they're missing - if '_pymodule' not in cfg: - cfg['_pymodule'] = base.get('_pymodule') - if '_kwargs' not in cfg: - cfg['_kwargs'] = base.get('_kwargs', {}) - if '_inject' not in cfg: - cfg['_inject'] = base.get('_inject', []) - if '_share' not in cfg: - cfg['_share'] = base.get('_share', []) - if '_name' not in cfg: - cfg['_name'] = base.get('_name', name) - - # merge user overrides into registered schema - if cfg_override is not None: - if isinstance(cfg, dict): - _keys = [k for k in list(cfg.keys()) if not k.startswith('_')] - for _arg in _keys: - del cfg[_arg] - cfg.update(cfg.get('_kwargs', {})) - cfg.update(cfg_override) - cfg.update(kwargs) - kwargs = {} + raise ValueError('The module {} is not registered'.format(name)) + + cfg = global_cfg[name] if isinstance(cfg, dict) and 'type' in cfg: _cfg: dict = global_cfg[cfg['type']] @@ -174,41 +111,18 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): _keys = [k for k in _cfg.keys() if not k.startswith('_')] for _arg in _keys: del _cfg[_arg] - _cfg.update(_cfg.get('_kwargs', {})) # restore default args + _cfg.update(_cfg['_kwargs']) # restore default args _cfg.update(cfg) # load config args _cfg.update(kwargs) name = _cfg.pop('type') # pop extra key `type` (from cfg) return create(name, global_cfg) - # Safety check: ensure _pymodule exists before accessing it - if '_pymodule' not in cfg: - if name in GLOBAL_CONFIG: - base = _safe_copy_cfg(GLOBAL_CONFIG[name]) - cfg['_pymodule'] = base.get('_pymodule') - cfg.setdefault('_kwargs', base.get('_kwargs', {})) - cfg.setdefault('_inject', base.get('_inject', [])) - cfg.setdefault('_share', base.get('_share', [])) - else: - raise ValueError( - f'The module {name} is not properly registered. ' - f'Missing _pymodule metadata. Make sure the module is imported and registered with @register() decorator.' - ) - - if cfg['_pymodule'] is None: - raise ValueError( - f'The module {name} has None _pymodule. ' - f'This indicates a registration issue. Make sure the module is properly imported.' 
- ) - module = getattr(cfg['_pymodule'], name) module_kwargs = {} module_kwargs.update(cfg) # shared var - # Safety check: ensure _share exists - if '_share' not in cfg: - cfg['_share'] = [] for k in cfg['_share']: if k in global_cfg: module_kwargs[k] = global_cfg[k] @@ -216,9 +130,6 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): module_kwargs[k] = cfg[k] # inject - # Safety check: ensure _inject exists - if '_inject' not in cfg: - cfg['_inject'] = [] for k in cfg['_inject']: _k = cfg[k] @@ -244,12 +155,12 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): if _type not in global_cfg: raise ValueError(f'Missing {_type} in inspect stage.') - _cfg: dict = _safe_copy_cfg(global_cfg[_type]) + _cfg: dict = global_cfg[_type] # clean args _keys = [k for k in _cfg.keys() if not k.startswith('_')] for _arg in _keys: del _cfg[_arg] - _cfg.update(_cfg.get('_kwargs', {})) # restore default values + _cfg.update(_cfg['_kwargs']) # restore default values _cfg.update(_k) # load config args name = _cfg.pop('type') # pop extra key (`type` from _k) module_kwargs[k] = create(name, global_cfg) diff --git a/engine/core/yaml_utils.py b/engine/core/yaml_utils.py index 214f3d938aa46c447379247fc293a511992cb21d..411e416d4f2ceb3e54c130860be7f765ed0e28af 100644 --- a/engine/core/yaml_utils.py +++ b/engine/core/yaml_utils.py @@ -113,17 +113,7 @@ def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: dct[k] = another[k] elif isinstance(dct[k], dict) and isinstance(another[k], dict): - # Prefer the registered schema (the one holding _pymodule etc.) - # If `another` is a registered schema, keep its metadata - if '_pymodule' in another[k] and '_pymodule' not in dct[k]: - # Keep the registered schema's metadata - base_meta = {key: val for key, val in another[k].items() if key.startswith('_')} - # Overwrite with parameters from the YAML config - _merge(dct[k], another[k]) - # Restore the metadata - dct[k].update(base_meta) - else: - _merge(dct[k], another[k]) + _merge(dct[k], another[k]) elif overwrite: dct[k] = another[k] diff --git a/engine/data/dataset/coco_dataset.py b/engine/data/dataset/coco_dataset.py index 7faaf1288e5ec4c8f8bca8714968261b93de93fb..83202e74019f07d259ba00989744684d91498e7d 100644 --- a/engine/data/dataset/coco_dataset.py +++ b/engine/data/dataset/coco_dataset.py @@ -11,27 +11,14 @@ import torch.utils.data import torchvision from PIL import Image -try: - import faster_coco_eval - import faster_coco_eval.core.mask as coco_mask - _faster_coco_eval_available = True -except ImportError: - _faster_coco_eval_available = False - # Not needed at inference time, so create a dummy object - class DummyFasterCocoEval: - @staticmethod - def init_as_pycocotools(): - pass - faster_coco_eval = DummyFasterCocoEval() - coco_mask = None - +import faster_coco_eval +import faster_coco_eval.core.mask as coco_mask from ._dataset import DetDataset from .._misc import convert_to_tv_tensor from ...core import register torchvision.disable_beta_transforms_warning() -if _faster_coco_eval_available: - faster_coco_eval.init_as_pycocotools() +faster_coco_eval.init_as_pycocotools() Image.MAX_IMAGE_PIXELS = None __all__ = ['CocoDetection'] diff --git a/engine/data/dataset/coco_eval.py b/engine/data/dataset/coco_eval.py index 937807f4b111f8609a3e838bd8e7809775295659..75f6bd8ddde3fa164fbc87f499f3f3a2919f4ee5 100644 --- a/engine/data/dataset/coco_eval.py +++ b/engine/data/dataset/coco_eval.py @@ -11,16 +11,8 @@ import copy import numpy as np import torch -try: - from faster_coco_eval import COCO, COCOeval_faster - import faster_coco_eval.core.mask as mask_util - _faster_coco_eval_available = True -except ImportError: - 
_faster_coco_eval_available = False - COCO = None - COCOeval_faster = None - mask_util = None - +from faster_coco_eval import COCO, COCOeval_faster +import faster_coco_eval.core.mask as mask_util from ...core import register from ...misc import dist_utils __all__ = ['CocoEvaluator',] diff --git a/engine/data/dataset/coco_utils.py b/engine/data/dataset/coco_utils.py index 1141581b21e27bfba0194c56636b4eb993f05847..6b81b5ea9524618fbbae575b89d42e780dc5a050 100644 --- a/engine/data/dataset/coco_utils.py +++ b/engine/data/dataset/coco_utils.py @@ -9,14 +9,8 @@ import torch import torch.utils.data import torchvision import torchvision.transforms.functional as TVF -try: - import faster_coco_eval.core.mask as coco_mask - from faster_coco_eval import COCO - _faster_coco_eval_available = True -except ImportError: - _faster_coco_eval_available = False - coco_mask = None - COCO = None +import faster_coco_eval.core.mask as coco_mask +from faster_coco_eval import COCO def convert_coco_poly_to_mask(segmentations, height, width): diff --git a/engine/data/transforms/_transforms.py b/engine/data/transforms/_transforms.py index a4442863ce25eed604163117f5ecf0f6e34ceb11..31588df5203041730da89b7231479b5b4fc92f20 100644 --- a/engine/data/transforms/_transforms.py +++ b/engine/data/transforms/_transforms.py @@ -114,55 +114,6 @@ class ConvertBoxes(T.Transform): return inpt -@register() -class RandomRotation(T.Transform): - _transformed_types = ( - PIL.Image.Image, - Image, - Video, - Mask, - BoundingBoxes, - ) - - def __init__(self, degrees, p=1.0) -> None: - super().__init__() - if isinstance(degrees, (int, float)): - degrees = [degrees] - self.degrees = degrees - self.p = p - - def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - if torch.rand(1) >= self.p: - return {"angle": 0} - - angle = torch.tensor(self.degrees)[torch.randint(0, len(self.degrees), (1,))].item() - return {"angle": angle} - - def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - angle = params["angle"] - if angle == 0: - return inpt - - # Add a type check and only handle supported types - if hasattr(inpt, '__class__') and inpt.__class__.__name__ in ['Image', 'BoundingBoxes', 'Mask', 'Video']: - return F.rotate(inpt, angle=angle, fill=0) - else: - # Return unsupported types unchanged - return inpt - - def __call__(self, *inputs: Any) -> Any: - if len(inputs) == 1: - return self._transform(inputs[0], self._get_params([inputs[0]])) - else: - params = self._get_params(inputs) - # Apply the transform to each input individually and collect the results in a list - results = [] - for inpt in inputs: - result = self._transform(inpt, params) - results.append(result) - return tuple(results) - - @register() class ConvertPILImage(T.Transform): _transformed_types = ( diff --git a/engine/deim/hybrid_encoder.py b/engine/deim/hybrid_encoder.py index 2752157c52f21c7eb8842d621dd9a803b637ac6a..77a74725bc966f130837ff7edbaa7a0730497ff6 100644 --- a/engine/deim/hybrid_encoder.py +++ b/engine/deim/hybrid_encoder.py @@ -199,9 +199,10 @@ class RepNCSPELAN4(nn.Module): super().__init__() self.c = c3//2 self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act) - CSPLayer_cls = CSPLayer2 if csp_type == 'csp2' else CSPLayer - self.cv2 = nn.Sequential(CSPLayer_cls(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) - self.cv3 = nn.Sequential(CSPLayer_cls(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + if csp_type == 'csp2': + CSPLayer = CSPLayer2 + self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, 
bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act) def forward_chunk(self, x): diff --git a/tools/benchmark/dataset.py b/tools/benchmark/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..76fa6491bac0719b0ad0bffe844d51a900038227 --- /dev/null +++ b/tools/benchmark/dataset.py @@ -0,0 +1,105 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import os +import glob +from PIL import Image + +import torch +import torch.utils.data as data +import torchvision +import torchvision.transforms as T +import torchvision.transforms.functional as F + +Image.MAX_IMAGE_PIXELS = None + +class ToTensor(T.ToTensor): + def __init__(self) -> None: + super().__init__() + + def __call__(self, pic): + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + +class PadToSize(T.Pad): + def __init__(self, size, fill=0, padding_mode='constant'): + super().__init__(0, fill, padding_mode) + self.size = size + self.fill = fill + + def __call__(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be padded. + + Returns: + PIL Image or Tensor: Padded image. + """ + w, h = F.get_image_size(img) + padding = (0, 0, self.size[0] - w, self.size[1] - h) + return F.pad(img, padding, self.fill, self.padding_mode) + + +class Dataset(data.Dataset): + def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: + super().__init__() + + self.device = device + self.size = 640 + + self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) + + if preprocess is None: + self.preprocess = T.Compose([ + T.Resize(size=639, max_size=640), + PadToSize(size=(640, 640), fill=114), + ToTensor(), + T.ConvertImageDtype(torch.float), + ]) + else: + self.preprocess = preprocess + + def __len__(self, ): + return len(self.im_path_list) + + def __getitem__(self, index): + # im = Image.open(self.img_path_list[index]).convert('RGB') + im = torchvision.io.read_file(self.im_path_list[index]) + im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) + _, h, w = im.shape # c,h,w + + im = self.preprocess(im) + + blob = { + 'images': im, + 'im_shape': torch.tensor([self.size, self.size]).to(im.device), + 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), + 'orig_target_sizes': torch.tensor([w, h]).to(im.device), + } + + return blob + + @staticmethod + def post_process(): + pass + + @staticmethod + def collate_fn(): + pass + + +def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): + '''show result + Keys: + 'num_dets', 'det_boxes', 'det_scores', 'det_classes' + ''' + for i in range(blob['image'].shape[0]): + det_scores = outputs['det_scores'][i] + det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] + + im = (blob['image'][i] * 255).to(torch.uint8) + im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) + Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') diff --git a/tools/benchmark/get_info.py b/tools/benchmark/get_info.py new file mode 100644 index 0000000000000000000000000000000000000000..b72efa35b599f2bd8d4b8440e505ba5c6ec8f2ca --- /dev/null +++ b/tools/benchmark/get_info.py @@ -0,0 
+1,50 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +""" + +import os +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) + +import argparse +from calflops import calculate_flops +from engine.core import YAMLConfig + +import torch +import torch.nn as nn + +def custom_repr(self): + return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' +original_repr = torch.Tensor.__repr__ +torch.Tensor.__repr__ = custom_repr + +def main(args, ): + """main + """ + cfg = YAMLConfig(args.config, resume=None) + class Model_for_flops(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + + def forward(self, images): + outputs = self.model(images) + return outputs + + model = Model_for_flops().eval() + + flops, macs, _ = calculate_flops(model=model, + input_shape=(1, 3, 640, 640), + output_as_string=True, + output_precision=4) + params = sum(p.numel() for p in model.parameters()) + print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', default= "configs/dfine/dfine_hgnetv2_l_coco.yml", type=str) + args = parser.parse_args() + + main(args) diff --git a/tools/benchmark/requirements.txt b/tools/benchmark/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a3c0f0a63f0ad81e1d41a19753d7c550231c27 --- /dev/null +++ b/tools/benchmark/requirements.txt @@ -0,0 +1,6 @@ +onnxruntime +tensorrt +pycuda +calflops +tqdm +# onnx_graphsurgeon # for YOLOs diff --git a/tools/benchmark/trt_benchmark.py b/tools/benchmark/trt_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..a650ac06f62b74034ee39a9ae66edb046486ad7c --- /dev/null +++ b/tools/benchmark/trt_benchmark.py @@ -0,0 +1,207 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import tensorrt as trt +import pycuda.driver as cuda +from utils import TimeProfiler +import numpy as np +import os +import time +import torch + +from collections import namedtuple, OrderedDict +import glob +import argparse +from dataset import Dataset +from tqdm import tqdm + + +def parse_args(): + parser = argparse.ArgumentParser(description='Argument Parser Example') + parser.add_argument('--COCO_dir', + type=str, + default='/data/COCO2017/val2017', + help="Directory for images to perform inference on.") + parser.add_argument("--engine_dir", + type=str, + help="Directory containing model engine files.") + parser.add_argument('--busy', + action='store_true', + help="Flag to indicate that other processes may be running.") + args = parser.parse_args() + return args + +class TRTInference(object): + def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False): + self.engine_path = engine_path + self.device = device + self.backend = backend + self.max_batch_size = max_batch_size + + self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) + self.engine = self.load_engine(engine_path) + self.context = self.engine.create_execution_context() + self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) + self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) + self.input_names = self.get_input_names() + self.output_names = self.get_output_names() + + if self.backend == 'cuda': + self.stream = cuda.Stream() + self.time_profile = TimeProfiler() + self.time_profile_dataset = TimeProfiler() + + def init(self): + self.dynamic = False + + def load_engine(self, path): + trt.init_libnvinfer_plugins(self.logger, '') + with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def get_input_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + names.append(name) + return names + + def get_output_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: + names.append(name) + return names + + def get_bindings(self, engine, context, max_batch_size=32, device=None): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + bindings = OrderedDict() + for i, name in enumerate(engine): + shape = engine.get_tensor_shape(name) + dtype = trt.nptype(engine.get_tensor_dtype(name)) + + if shape[0] == -1: + dynamic = True + shape[0] = max_batch_size + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + context.set_input_shape(name, shape) + + if self.backend == 'cuda': + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + data = np.random.randn(*shape).astype(dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + else: + data = cuda.pagelocked_empty(trt.volume(shape), dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + else: + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) + return bindings + + def run_torch(self, blob): + for n in self.input_names: + if self.bindings[n].shape != blob[n].shape: + self.context.set_input_shape(n, blob[n].shape) + self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) + + self.bindings_addr.update({n: 
blob[n].data_ptr() for n in self.input_names}) + self.context.execute_v2(list(self.bindings_addr.values())) + outputs = {n: self.bindings[n].data for n in self.output_names} + return outputs + + def async_run_cuda(self, blob): + for n in self.input_names: + cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) + + bindings_addr = [int(v) for _, v in self.bindings_addr.items()] + self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) + + outputs = {} + for n in self.output_names: + cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) + outputs[n] = self.bindings[n].data + + self.stream.synchronize() + + return outputs + + def __call__(self, blob): + if self.backend == 'torch': + return self.run_torch(blob) + elif self.backend == 'cuda': + return self.async_run_cuda(blob) + + def synchronize(self): + if self.backend == 'torch' and torch.cuda.is_available(): + torch.cuda.synchronize() + elif self.backend == 'cuda': + self.stream.synchronize() + + def warmup(self, blob, n): + for _ in range(n): + _ = self(blob) + + def speed(self, blob, n, nonempty_process=False): + times = [] + self.time_profile_dataset.reset() + for i in tqdm(range(n), desc="Running Inference", unit="iteration"): + self.time_profile.reset() + with self.time_profile_dataset: + img = blob[i] + if img['images'] is not None: + img['image'] = img['input'] = img['images'].unsqueeze(0) + else: + img['images'] = img['input'] = img['image'].unsqueeze(0) + with self.time_profile: + _ = self(img) + times.append(self.time_profile.total) + + # end-to-end model only + times = sorted(times) + if len(times) > 100 and nonempty_process: + times = times[:100] + + avg_time = sum(times) / len(times) # Calculate the average of the remaining times + return avg_time + +def main(): + FLAGS = parse_args() + dataset = Dataset(FLAGS.COCO_dir) + im = torch.ones(1, 3, 640, 640).cuda() + blob = { + 'image': im, + 'images': im, + 'input': im, + 'im_shape': torch.tensor([640, 640]).to(im.device), + 'scale_factor': torch.tensor([1, 1]).to(im.device), + 'orig_target_sizes': torch.tensor([640, 640]).to(im.device), + } + + engine_files = glob.glob(os.path.join(FLAGS.engine_dir, "*.engine")) + results = [] + + for engine_file in engine_files: + print(f"Testing engine: {engine_file}") + model = TRTInference(engine_file, max_batch_size=1, verbose=False) + model.init() + model.warmup(blob, 1000) + t = [] + for _ in range(1): + t.append(model.speed(dataset, 1000, FLAGS.busy)) + avg_latency = 1000 * torch.tensor(t).mean() + results.append((engine_file, avg_latency)) + print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms") + + del model + torch.cuda.empty_cache() + time.sleep(1) + + sorted_results = sorted(results, key=lambda x: x[1]) + for engine_file, latency in sorted_results: + print(f"Engine: {engine_file}, Latency: {latency:.2f} ms") + +if __name__ == '__main__': + main() diff --git a/tools/benchmark/utils.py b/tools/benchmark/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..23e1800a2deaf84f3fec46c1a40b6e29f4772719 --- /dev/null +++ b/tools/benchmark/utils.py @@ -0,0 +1,80 @@ +import time +import contextlib +import numpy as np +from PIL import Image +from collections import OrderedDict + +import onnx +import torch +import onnx_graphsurgeon + + +def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): + '''--loadInputs='image:input_tensor.bin' + ''' + im = Image.open(path).resize(size) + data = np.asarray(im, 
dtype=np.float32).transpose(2, 0, 1)[None] / 255. + data.tofile(output_name) + + +def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): + ''' + http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html + https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py + ''' + onnx_model = onnx.load(path) + + if simplify: + from onnxsim import simplify + onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) + + graph = onnx_graphsurgeon.import_onnx(onnx_model) + graph.toposort() + graph.fold_constants() + graph.cleanup() + + topk = max_output_boxes + attrs = OrderedDict(plugin_version='1', + background_class=-1, + max_output_boxes=topk, + score_threshold=score_threshold, + iou_threshold=iou_threshold, + score_activation=False, + box_coding=0, ) + + outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), + onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), + onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), + onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] + + graph.layer(op='EfficientNMS_TRT', + name="batched_nms", + inputs=[graph.outputs[0], + graph.outputs[1]], + outputs=outputs, + attrs=attrs, ) + + graph.outputs = outputs + graph.cleanup().toposort() + + onnx.save(onnx_graphsurgeon.export_onnx(graph), 'yolo_w_nms.onnx') + + +class TimeProfiler(contextlib.ContextDecorator): + def __init__(self, ): + self.total = 0 + + def __enter__(self, ): + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): + self.total += self.time() - self.start + + def reset(self, ): + self.total = 0 + + def time(self, ): + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/tools/dataset/remap_obj365.py b/tools/dataset/remap_obj365.py new file mode 100644 index 0000000000000000000000000000000000000000..f76214e7a05b5c158deaecbe4f4994e020bf8226 --- /dev/null +++ b/tools/dataset/remap_obj365.py @@ -0,0 +1,139 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import json +import os +import argparse + + +def update_image_paths(images, new_prefix): + print('Updating image paths with new prefix...') + for img in images: + split = img['file_name'].split('/')[1:] + img['file_name'] = os.path.join(new_prefix, *split) + print('Image paths updated.') + return images + +def create_split_annotations(original_annotations, split_image_ids, new_prefix, output_file): + print(f'Creating split annotations for {output_file}...') + new_images = [img for img in original_annotations['images'] if img['id'] in split_image_ids] + print(f'Number of images selected: {len(new_images)}') + if new_prefix is not None: + new_images = update_image_paths(new_images, new_prefix) + + new_annotations = { + 'images': new_images, + 'annotations': [ann for ann in original_annotations['annotations'] if ann['image_id'] in split_image_ids], + 'categories': original_annotations['categories'] + } + print(f'Number of annotations selected: {len(new_annotations["annotations"])}') + with open(output_file, 'w') as f: + json.dump(new_annotations, f) + print(f'Annotations saved to {output_file}') + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Split and update dataset annotations.') + parser.add_argument( + '--base_dir', + type=str, + default='/datassd/objects365', + help='Base directory of the dataset, e.g., /data/Objects365/data' + ) + parser.add_argument( + '--new_val_size', + type=int, + default=5000, + help='Number of images to include in the new validation set (default: 5000)' + ) + parser.add_argument( + '--output_suffix', + type=str, + default='new', + help='Suffix to add to new annotation files (default: new)' + ) + return parser.parse_args() + +def main(): + args = parse_arguments() + base_dir = args.base_dir + new_val_size = args.new_val_size + output_suffix = args.output_suffix + + # Define paths based on the base directory + original_train_ann_file = os.path.join(base_dir, 'train', 'zhiyuan_objv2_train.json') + original_val_ann_file = os.path.join(base_dir, 'val', 'zhiyuan_objv2_val.json') + + new_val_ann_file = os.path.join(base_dir, 'val', f'{output_suffix}_zhiyuan_objv2_val.json') + new_train_ann_file = os.path.join(base_dir, 'train', f'{output_suffix}_zhiyuan_objv2_train.json') + + # Check if original annotation files exist + if not os.path.isfile(original_train_ann_file): + print(f'Error: Training annotation file not found at {original_train_ann_file}') + return + if not os.path.isfile(original_val_ann_file): + print(f'Error: Validation annotation file not found at {original_val_ann_file}') + return + + # Load the original training and validation annotations + print('Loading original training annotations...') + with open(original_train_ann_file, 'r') as f: + train_annotations = json.load(f) + print('Training annotations loaded.') + + print('Loading original validation annotations...') + with open(original_val_ann_file, 'r') as f: + val_annotations = json.load(f) + print('Validation annotations loaded.') + + # Extract image IDs from the original validation set + print('Extracting image IDs from the validation set...') + val_image_ids = [img['id'] for img in val_annotations['images']] + print(f'Total validation images: {len(val_image_ids)}') + + # Split image IDs for the new training and validation sets + print(f'Splitting validation images into new validation set of size {new_val_size} and training set...') + new_val_image_ids = val_image_ids[:new_val_size] + new_train_image_ids = val_image_ids[new_val_size:] + print(f'New validation set size: 
{len(new_val_image_ids)}') + print(f'New training set size from validation images: {len(new_train_image_ids)}') + + # Create new validation annotation file + print('Creating new validation annotations...') + create_split_annotations(val_annotations, new_val_image_ids, None, new_val_ann_file) + print('New validation annotations created.') + + # Combine the remaining validation images and annotations with the original training data + print('Preparing new training images and annotations...') + new_train_images = [img for img in val_annotations['images'] if img['id'] in new_train_image_ids] + print(f'Number of images from validation to add to training: {len(new_train_images)}') + new_train_images = update_image_paths(new_train_images, 'images_from_val') + new_train_annotations = [ann for ann in val_annotations['annotations'] if ann['image_id'] in new_train_image_ids] + print(f'Number of annotations from validation to add to training: {len(new_train_annotations)}') + + # Add the original training images and annotations + print('Adding original training images and annotations...') + new_train_images.extend(train_annotations['images']) + new_train_annotations.extend(train_annotations['annotations']) + print(f'Total training images: {len(new_train_images)}') + print(f'Total training annotations: {len(new_train_annotations)}') + + # Create a new training annotation dictionary + print('Creating new training annotations dictionary...') + new_train_annotations_dict = { + 'images': new_train_images, + 'annotations': new_train_annotations, + 'categories': train_annotations['categories'] + } + print('New training annotations dictionary created.') + + # Save the new training annotations + print('Saving new training annotations...') + with open(new_train_ann_file, 'w') as f: + json.dump(new_train_annotations_dict, f) + print(f'New training annotations saved to {new_train_ann_file}') + + print('Processing completed successfully.') + +if __name__ == '__main__': + main() diff --git a/tools/dataset/resize_obj365.py b/tools/dataset/resize_obj365.py new file mode 100644 index 0000000000000000000000000000000000000000..d14fd865ef5e260d0d6e8b26f32daddc8c088b69 --- /dev/null +++ b/tools/dataset/resize_obj365.py @@ -0,0 +1,147 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import os +import json +from PIL import Image +from concurrent.futures import ThreadPoolExecutor +import argparse + + +def resize_image_and_update_annotations(image_path, annotations, max_size=640): + print(f"Processing image: {image_path}") + try: + with Image.open(image_path) as img: + w, h = img.size + if max(w, h) <= max_size: + return annotations, w, h, False # No need to resize + + scale = max_size / max(w, h) + new_w = int(w * scale) + new_h = int(h * scale) + print(f"Resizing image to width={new_w}, height={new_h}") + + img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) + new_image_path = image_path.replace('.jpg', '_resized{}.jpg'.format(max_size)) + img.save(new_image_path) + print(f"Resized image saved: {new_image_path}") + print(f"Original size: ({w}, {h}), New size: ({new_w}, {new_h})") + + # Update annotations + for ann in annotations: + ann['area'] = ann['area'] * (scale ** 2) + ann['bbox'] = [coord * scale for coord in ann['bbox']] + if 'orig_size' in ann: + ann['orig_size'] = (new_w, new_h) + if 'size' in ann: + ann['size'] = (new_w, new_h) + + except Exception as e: + print(f"Error processing {image_path}: {e}") + return None + + return annotations, new_w, new_h, True + +def resize_images_and_update_annotations(base_dir, subset, max_size=640, num_workers=4): + print(f"Starting to resize images and update annotations for subset: {subset}") + json_file = os.path.join(base_dir, subset, 'new_zhiyuan_objv2_{}.json'.format(subset)) + if not os.path.isfile(json_file): + print(f'Error: JSON file not found at {json_file}') + return + + print(f"Loading JSON file: {json_file}") + with open(json_file, 'r') as f: + data = json.load(f) + print("JSON file loaded.") + + print("Preparing image annotations mapping...") + image_annotations = {img['id']: [] for img in data['images']} + for ann in data['annotations']: + image_annotations[ann['image_id']].append(ann) + print("Image annotations mapping prepared.") + + def process_image(image_info): + image_path = os.path.join(base_dir, subset, image_info['file_name']) + results = resize_image_and_update_annotations(image_path, image_annotations[image_info['id']], max_size) + if results is None: + updated_annotations, new_w, new_h, resized = None, None, None, None + else: + updated_annotations, new_w, new_h, resized = results + return image_info, updated_annotations, new_w, new_h, resized + + print(f"Processing images with {num_workers} worker threads...") + with ThreadPoolExecutor(max_workers=num_workers) as executor: + results = list(executor.map(process_image, data['images'])) + print("Image processing completed.") + + new_images = [] + new_annotations = [] + + print("Updating image and annotation data...") + for image_info, updated_annotations, new_w, new_h, resized in results: + if updated_annotations is not None: + image_info['width'] = new_w + image_info['height'] = new_h + image_annotations[image_info['id']] = updated_annotations + if resized: + image_info['file_name'] = image_info['file_name'].replace('.jpg', '_resized{}.jpg'.format(max_size)) + new_images.append(image_info) + new_annotations.extend(updated_annotations) + print(f"Total images processed: {len(new_images)}") + print(f"Total annotations updated: {len(new_annotations)}") + + new_data = { + 'images': new_images, + 'annotations': new_annotations, + 'categories': data['categories'] + } + + new_json_file = json_file.replace('.json', '_resized{}.json'.format(max_size)) + print('Saving new training annotations...') + with open(new_json_file, 'w') as f: + 
json.dump(new_data, f) + print(f'New JSON file saved to {new_json_file}') + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Resize images and update dataset annotations for both train and val sets.') + parser.add_argument( + '--base_dir', + type=str, + default='/datassd/objects365', + help='Base directory of the dataset, e.g., /data/Objects365/data' + ) + parser.add_argument( + '--max_size', + type=int, + default=640, + help='Maximum size for the longer side of the image (default: 640)' + ) + parser.add_argument( + '--num_workers', + type=int, + default=4, + help='Number of worker threads for parallel processing (default: 4)' + ) + args = parser.parse_args() + return args + +def main(): + args = parse_arguments() + base_dir = args.base_dir + max_size = args.max_size + num_workers = args.num_workers + + subsets = ['train', 'val'] + for subset in subsets: + print(f'Processing subset: {subset}') + resize_images_and_update_annotations( + base_dir=base_dir, + subset=subset, + max_size=max_size, + num_workers=num_workers + ) + print("All subsets processed.") + +if __name__ == "__main__": + main() diff --git a/tools/deployment/export_onnx.py b/tools/deployment/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..186fda35f2a319dc841bcd2d38752df5c851b1b6 --- /dev/null +++ b/tools/deployment/export_onnx.py @@ -0,0 +1,109 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
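+
+Usage (a minimal sketch; the checkpoint path is an illustrative assumption):
+
+    python tools/deployment/export_onnx.py \
+        -c configs/dfine/dfine_hgnetv2_l_coco.yml \
+        -r model.pth --check --simplify
+
+The exported .onnx file is written next to the checkpoint (or to model.onnx
+when no checkpoint is given).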
+""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) + +import torch +import torch.nn as nn + +from engine.core import YAMLConfig + + +def main(args, ): + """main + """ + cfg = YAMLConfig(args.config, resume=args.resume) + + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + + else: + # raise AttributeError('Only support resume to load model.state_dict by now.') + print('not load model.state_dict, use default init state dict...') + + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model() + + img_size = cfg.yaml_cfg["eval_spatial_size"] + data = torch.rand(32, 3, *img_size) + size = torch.tensor([img_size]) + _ = model(data, size) + + dynamic_axes = { + 'images': {0: 'N', }, + 'orig_target_sizes': {0: 'N'} + } + + output_file = args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' + + torch.onnx.export( + model, + (data, size), + output_file, + input_names=['images', 'orig_target_sizes'], + output_names=['labels', 'boxes', 'scores'], + dynamic_axes=dynamic_axes, + opset_version=args.opset, + verbose=False, + do_constant_folding=True, + ) + + if args.check: + import onnx + onnx_model = onnx.load(output_file) + onnx.checker.check_model(onnx_model) + print('Check export onnx model done...') + + if args.simplify: + import onnx + import onnxsim + dynamic = True + # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None + input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None + onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) + onnx.save(onnx_model_simplify, output_file) + print(f'Simplify onnx model {check}...') + + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', default='configs/dfine/dfine_hgnetv2_l_coco.yml', type=str, ) + parser.add_argument('--resume', '-r', type=str, ) + parser.add_argument('--opset', type=int, default=17,) + parser.add_argument('--check', action='store_true') + parser.add_argument('--simplify', action='store_true') + args = parser.parse_args() + main(args) diff --git a/tools/deployment/export_yolo_w_nms.py b/tools/deployment/export_yolo_w_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..95c89b213d9436cba49c093d4427757b907a5e03 --- /dev/null +++ b/tools/deployment/export_yolo_w_nms.py @@ -0,0 +1,74 @@ +import torch +import torchvision + +import numpy as np +import onnxruntime as ort + +from utils import yolo_insert_nms + +class YOLO11(torch.nn.Module): + def __init__(self, name) -> None: + super().__init__() + from ultralytics import YOLO + # Load a model + # build a new model from scratch + # model = YOLO(f'{name}.yaml') + + # load a pretrained model (recommended for training) + model = YOLO("yolo11n.pt") + self.model = model.model + + def forward(self, x): + 
'''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 + ''' + pred: torch.Tensor = self.model(x)[0] # n 84 8400, + pred = pred.permute(0, 2, 1) + boxes, scores = pred.split([4, 80], dim=-1) + boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + + return boxes, scores + + + +def export_onnx(name='yolov8n'): + '''export onnx + ''' + m = YOLO11(name) + + x = torch.rand(1, 3, 640, 640) + dynamic_axes = { + 'image': {0: '-1'} + } + torch.onnx.export(m, x, f'{name}.onnx', + input_names=['image'], + output_names=['boxes', 'scores'], + opset_version=13, + dynamic_axes=dynamic_axes) + + data = np.random.rand(1, 3, 640, 640).astype(np.float32) + sess = ort.InferenceSession(f'{name}.onnx') + _ = sess.run(output_names=None, input_feed={'image': data}) + + import onnx + import onnxslim + model_onnx = onnx.load(f'{name}.onnx') + model_onnx = onnxslim.slim(model_onnx) + onnx.save(model_onnx, f'{name}.onnx') + + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--name', type=str, default='yolo11n_tuned') + parser.add_argument('--score_threshold', type=float, default=0.01) + parser.add_argument('--iou_threshold', type=float, default=0.6) + parser.add_argument('--max_output_boxes', type=int, default=300) + args = parser.parse_args() + + export_onnx(name=args.name) + + yolo_insert_nms(path=f'{args.name}.onnx', + score_threshold=args.score_threshold, + iou_threshold=args.iou_threshold, + max_output_boxes=args.max_output_boxes, ) diff --git a/tools/inference/onnx_inf.py b/tools/inference/onnx_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0016019cd43a9b8d64c454493c7c4283a5b0d4 --- /dev/null +++ b/tools/inference/onnx_inf.py @@ -0,0 +1,175 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
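+
+Usage (a minimal sketch; the file names are illustrative):
+
+    python tools/inference/onnx_inf.py --onnx model.onnx --input image.jpg -ms s
+
+Results are written to onnx_result.jpg for images or onnx_result.mp4 for
+videos. Model sizes atto/femto/pico/n skip ImageNet normalization.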
+""" + +import cv2 +import numpy as np +import onnxruntime as ort +import torch +import torchvision.transforms as T +from PIL import Image, ImageDraw + + +def resize_with_aspect_ratio(image, size, interpolation=Image.BILINEAR): + """Resizes an image while maintaining aspect ratio and pads it.""" + original_width, original_height = image.size + ratio = min(size / original_width, size / original_height) + new_width = int(original_width * ratio) + new_height = int(original_height * ratio) + image = image.resize((new_width, new_height), interpolation) + + # Create a new image with the desired size and paste the resized image onto it + new_image = Image.new("RGB", (size, size)) + new_image.paste(image, ((size - new_width) // 2, (size - new_height) // 2)) + return new_image, ratio, (size - new_width) // 2, (size - new_height) // 2 + + +def draw(images, labels, boxes, scores, ratios, paddings, thrh=0.4): + result_images = [] + for i, im in enumerate(images): + draw = ImageDraw.Draw(im) + scr = scores[i] + lab = labels[i][scr > thrh] + box = boxes[i][scr > thrh] + scr = scr[scr > thrh] + + ratio = ratios[i] + pad_w, pad_h = paddings[i] + + for lbl, bb in zip(lab, box): + # Adjust bounding boxes according to the resizing and padding + bb = [ + (bb[0] - pad_w) / ratio, + (bb[1] - pad_h) / ratio, + (bb[2] - pad_w) / ratio, + (bb[3] - pad_h) / ratio, + ] + draw.rectangle(bb, outline='red') + draw.text((bb[0], bb[1]), text=str(lbl), fill='blue') + + result_images.append(im) + return result_images + + +def process_image(sess, im_pil, size=640, model_size='s'): + # Resize image while preserving aspect ratio + resized_im_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(im_pil, size) + orig_size = torch.tensor([[resized_im_pil.size[1], resized_im_pil.size[0]]]) + + transforms = T.Compose([ + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if model_size not in ['atto', 'femto', 'pico', 'n'] + else T.Lambda(lambda x: x) + ]) + im_data = transforms(resized_im_pil).unsqueeze(0) + + output = sess.run( + output_names=None, + input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} + ) + + labels, boxes, scores = output + + result_images = draw( + [im_pil], labels, boxes, scores, + [ratio], [(pad_w, pad_h)] + ) + result_images[0].save('onnx_result.jpg') + print("Image processing complete. 
Result saved as 'result.jpg'.") + + +def process_video(sess, video_path, size=640, model_size='s'): + cap = cv2.VideoCapture(video_path) + + # Get video properties + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Define the codec and create VideoWriter object + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter('onnx_result.mp4', fourcc, fps, (orig_w, orig_h)) + + frame_count = 0 + print("Processing video frames...") + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Convert frame to PIL image + frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + + # Resize frame while preserving aspect ratio + resized_frame_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(frame_pil, size) + orig_size = torch.tensor([[resized_frame_pil.size[1], resized_frame_pil.size[0]]]) + + transforms = T.Compose([ + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if model_size not in ['atto', 'femto', 'pico', 'n'] + else T.Lambda(lambda x: x) + ]) + im_data = transforms(resized_frame_pil).unsqueeze(0) + + output = sess.run( + output_names=None, + input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} + ) + + labels, boxes, scores = output + + # Draw detections on the original frame + result_images = draw( + [frame_pil], labels, boxes, scores, + [ratio], [(pad_w, pad_h)] + ) + frame_with_detections = result_images[0] + + # Convert back to OpenCV image + frame = cv2.cvtColor(np.array(frame_with_detections), cv2.COLOR_RGB2BGR) + + # Write the frame + out.write(frame) + frame_count += 1 + + if frame_count % 10 == 0: + print(f"Processed {frame_count} frames...") + + cap.release() + out.release() + print("Video processing complete. Result saved as 'result.mp4'.") + + +def main(args): + """Main function.""" + # Load the ONNX model + sess = ort.InferenceSession(args.onnx) + size = sess.get_inputs()[0].shape[2] + print(f"Using device: {ort.get_device()}") + + input_path = args.input + + try: + # Try to open the input as an image + im_pil = Image.open(input_path).convert('RGB') + process_image(sess, im_pil, size, args.model_size) + except IOError: + # Not an image, process as video + process_video(sess, input_path, size, args.model_size) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--onnx', type=str, required=True, help='Path to the ONNX model file.') + parser.add_argument('--input', type=str, required=True, help='Path to the input image or video file.') + parser.add_argument('-ms', '--model-size', type=str, required=True, choices=['atto', 'femto', 'pico', 'n', 's', 'm', 'l', 'x'], + help='Model size') + args = parser.parse_args() + main(args) diff --git a/tools/inference/openvino_inf.py b/tools/inference/openvino_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..4a66755a256f56a5594508d003d1820df23fd2e3 --- /dev/null +++ b/tools/inference/openvino_inf.py @@ -0,0 +1,7 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + + +# please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO diff --git a/tools/inference/requirements.txt b/tools/inference/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..46a470c8805cfc80a3bf4c4cb6a87dacc237021f --- /dev/null +++ b/tools/inference/requirements.txt @@ -0,0 +1,2 @@ +onnxruntime +tensorrt diff --git a/tools/inference/torch_inf.py b/tools/inference/torch_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..86e090016bc54870482d9bb67a22af5b1ef27227 --- /dev/null +++ b/tools/inference/torch_inf.py @@ -0,0 +1,167 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +""" + +import os +import sys + +import cv2 # Added for video processing +import numpy as np +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image, ImageDraw + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from engine.core import YAMLConfig + + +def draw(images, labels, boxes, scores, thrh=0.45): + for i, im in enumerate(images): + draw = ImageDraw.Draw(im) + + scr = scores[i] + lab = labels[i][scr > thrh] + box = boxes[i][scr > thrh] + scrs = scr[scr > thrh] + + for j, b in enumerate(box): + draw.rectangle(list(b), outline='red') + draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", fill='blue', ) + + im.save('torch_results.jpg') + + +def process_image(model, device, file_path, size=(640, 640), vit_backbone=False): + im_pil = Image.open(file_path).convert('RGB') + w, h = im_pil.size + orig_size = torch.tensor([[w, h]]).to(device) + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + im_data = transforms(im_pil).unsqueeze(0).to(device) + + output = model(im_data, orig_size) + labels, boxes, scores = output + + draw([im_pil], labels, boxes, scores) + + +def process_video(model, device, file_path, size=(640, 640), vit_backbone=False): + cap = cv2.VideoCapture(file_path) + + # Get video properties + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Define the codec and create VideoWriter object + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h)) + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + + frame_count = 0 + print("Processing video frames...") + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Convert frame to PIL image + frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + + w, h = frame_pil.size + orig_size = torch.tensor([[w, h]]).to(device) + + im_data = transforms(frame_pil).unsqueeze(0).to(device) + + output = model(im_data, orig_size) + labels, boxes, scores = output + + # Draw detections on the frame + draw([frame_pil], labels, boxes, scores) + + # Convert back to OpenCV image + frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR) + + # Write the frame + out.write(frame) + frame_count += 1 + + if 
frame_count % 10 == 0:
+            print(f"Processed {frame_count} frames...")
+
+    cap.release()
+    out.release()
+    print("Video processing complete. Result saved as 'torch_results.mp4'.")
+
+
+def main(args):
+    """Main function"""
+    cfg = YAMLConfig(args.config, resume=args.resume)
+
+    if 'HGNetv2' in cfg.yaml_cfg:
+        cfg.yaml_cfg['HGNetv2']['pretrained'] = False
+
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu')
+        if 'ema' in checkpoint:
+            state = checkpoint['ema']['module']
+        else:
+            state = checkpoint['model']
+    else:
+        raise AttributeError('Only loading model.state_dict via --resume is supported for now.')
+
+    # Load train mode state and convert to deploy mode
+    cfg.model.load_state_dict(state)
+
+    class Model(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    device = args.device
+    model = Model().to(device)
+    img_size = cfg.yaml_cfg["eval_spatial_size"]
+    vit_backbone = cfg.yaml_cfg.get('DINOv3STAs', False)
+
+    # Check if the input file is an image or a video
+    file_path = args.input
+    if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
+        # Process as image
+        process_image(model, device, file_path, img_size, vit_backbone)
+        print("Image processing complete.")
+    else:
+        # Process as video
+        process_video(model, device, file_path, img_size, vit_backbone)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, required=True)
+    parser.add_argument('-r', '--resume', type=str, required=True)
+    parser.add_argument('-i', '--input', type=str, required=True)
+    parser.add_argument('-d', '--device', type=str, default='cpu')
+    args = parser.parse_args()
+    main(args)
diff --git a/tools/inference/torch_inf_vis.py b/tools/inference/torch_inf_vis.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5ef632c84a1e1fe69441d05aa7aaceb5423f38
--- /dev/null
+++ b/tools/inference/torch_inf_vis.py
@@ -0,0 +1,155 @@
+"""
+DEIMv2: Real-Time Object Detection Meets DINOv3
+Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved.
+---------------------------------------------------------------------------------
+Modified from D-FINE (https://github.com/Peterande/D-FINE)
+Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
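+
+Usage (a minimal sketch; the config and checkpoint paths are illustrative):
+
+    python tools/inference/torch_inf_vis.py \
+        -c configs/dfine/dfine_hgnetv2_l_coco.yml -r best.pth \
+        -d ./data/fiftyone/validation/data -o ./vis_results
+
+Requires a CUDA device; each image is saved to the output directory as
+vis_<original filename>.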
+""" + +import os +import random +import sys + +import cv2 # Added for video processing +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image, ImageDraw, ImageFont + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from engine.core import YAMLConfig + +label_map = { + 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane', + 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight', + 11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter', + 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', + 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', + 25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe', + 30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', + 35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat', + 40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket', + 44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork', + 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', + 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog', + 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa', + 64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window', + 69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop', + 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave', + 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender', + 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear', + 89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush' +} + + +COLORS = plt.cm.tab20.colors +COLOR_MAP = {label: tuple([int(c * 255) for c in COLORS[i % len(COLORS)]]) for i, label in enumerate(label_map)} + + + +def draw(image, labels, boxes, scores, thrh=0.45): + draw = ImageDraw.Draw(image) + font = ImageFont.load_default() + labels, boxes, scores = labels[scores > thrh], boxes[scores > thrh], scores[scores > thrh] + + for j, box in enumerate(boxes): + category = labels[j].item() + color = COLOR_MAP.get(category, (255, 255, 255)) + box = list(map(int, box)) + + + draw.rectangle(box, outline=color, width=3) + + text = f"{label_map[category]} {scores[j].item():.2f}" + text_bbox = draw.textbbox((0, 0), text, font=font) + text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1] + + text_background = [box[0], box[1] - text_height - 2, box[0] + text_width + 4, box[1]] + draw.rectangle(text_background, fill=color) + + draw.text((box[0] + 2, box[1] - text_height - 2), text, fill="black", font=font) + + return image + + +def process_dataset(model, dataset_path, output_path, thrh=0.5, size=(640, 640), vit_backbone=False): + os.makedirs(output_path, exist_ok=True) + image_paths = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith(('.jpg', '.png'))] + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + + print(f"Found {len(image_paths)} images in validation set...") + for idx, file_path in enumerate(image_paths): + im_pil = Image.open(file_path).convert('RGB') + w, h = im_pil.size + orig_size = torch.tensor([[w, h]]).cuda() + + # 图像预处理 + im_data = transforms(im_pil).unsqueeze(0).cuda() + output = model(im_data, orig_size) + labels, boxes, scores = 
output[0]['labels'], output[0]['boxes'], output[0]['scores'] + + # 绘制结果 + vis_image = draw(im_pil.copy(), labels, boxes, scores, thrh) + save_path = os.path.join(output_path, f"vis_{os.path.basename(file_path)}") + vis_image.save(save_path) + + if idx % 500 == 0: + print(f"Processed {idx}/{len(image_paths)} images...") + + print("Visualization complete. Results saved in:", output_path) + + +def main(args): + """Main function""" + cfg = YAMLConfig(args.config, resume=args.resume) + + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('Only support resume to load model.state_dict by now.') + + # Load train mode state and convert to deploy mode + cfg.model.load_state_dict(state) + + class Model(nn.Module): + def __init__(self): + super().__init__() + self.model = cfg.model.eval().cuda() + self.postprocessor = cfg.postprocessor.eval().cuda() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model() + img_size = cfg.yaml_cfg["eval_spatial_size"] + vit_backbone = cfg.yaml_cfg.get('DINOv3STAs', False) + + process_dataset(model, args.dataset, args.output, thrh=0.45, size=img_size, vit_backbone=vit_backbone) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True) + parser.add_argument('-r', '--resume', type=str, required=True) + parser.add_argument('-d', '--dataset', type=str, default='./data/fiftyone/validation/data') + parser.add_argument('-o', '--output', type=str, required=True, help="Path to save visualized results") + args = parser.parse_args() + main(args) diff --git a/tools/inference/trt_inf.py b/tools/inference/trt_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..e98e0560b14f885df3ffd9ce86720227332fdb8c --- /dev/null +++ b/tools/inference/trt_inf.py @@ -0,0 +1,242 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
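+
+Usage (a minimal sketch; file names are illustrative). A TensorRT engine must
+first be built from the exported ONNX model, e.g. with trtexec:
+
+    trtexec --onnx=model.onnx --saveEngine=model.engine
+    python tools/inference/trt_inf.py -trt model.engine -i image.jpg -s 640 -ms s
+
+Results are written to trt_result.jpg for images or trt_result.mp4 for videos.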
+""" + +import collections +import contextlib +import os +import time +from collections import OrderedDict + +import cv2 # Added for video processing +import numpy as np +import tensorrt as trt +import torch +import torchvision.transforms as T +from PIL import Image, ImageDraw + + +class TimeProfiler(contextlib.ContextDecorator): + def __init__(self): + self.total = 0 + + def __enter__(self): + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): + self.total += self.time() - self.start + + def reset(self): + self.total = 0 + + def time(self): + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() + +class TRTInference(object): + def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False): + self.engine_path = engine_path + self.device = device + self.backend = backend + self.max_batch_size = max_batch_size + + self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) + + self.engine = self.load_engine(engine_path) + self.context = self.engine.create_execution_context() + self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) + self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) + self.input_names = self.get_input_names() + self.output_names = self.get_output_names() + self.time_profile = TimeProfiler() + + def load_engine(self, path): + trt.init_libnvinfer_plugins(self.logger, '') + with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def get_input_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + names.append(name) + return names + + def get_output_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: + names.append(name) + return names + + def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict: + Binding = collections.namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + bindings = OrderedDict() + + for i, name in enumerate(engine): + shape = engine.get_tensor_shape(name) + dtype = trt.nptype(engine.get_tensor_dtype(name)) + + if shape[0] == -1: + shape[0] = max_batch_size + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + context.set_input_shape(name, shape) + + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) + + return bindings + + def run_torch(self, blob): + for n in self.input_names: + if self.bindings[n].shape != blob[n].shape: + self.context.set_input_shape(n, blob[n].shape) + self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) + + assert self.bindings[n].data.dtype == blob[n].dtype, '{} dtype mismatch'.format(n) + + self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) + self.context.execute_v2(list(self.bindings_addr.values())) + outputs = {n: self.bindings[n].data for n in self.output_names} + + return outputs + + def __call__(self, blob): + if self.backend == 'torch': + return self.run_torch(blob) + else: + raise NotImplementedError("Only 'torch' backend is implemented.") + + def synchronize(self): + if self.backend == 'torch' and torch.cuda.is_available(): + torch.cuda.synchronize() + +def draw(images, labels, boxes, scores, thrh=0.4): + for i, im in enumerate(images): + draw 
= ImageDraw.Draw(im)
+        scr = scores[i]
+        lab = labels[i][scr > thrh]
+        box = boxes[i][scr > thrh]
+        scrs = scr[scr > thrh]
+
+        for j, b in enumerate(box):
+            draw.rectangle(list(b), outline='red')
+            draw.text(
+                (b[0], b[1]),
+                text=f"{lab[j].item()} {round(scrs[j].item(), 2)}",
+                fill='blue',
+            )
+
+    return images
+
+def process_image(m, file_path, device, size=(640, 640), model_size='s'):
+    im_pil = Image.open(file_path).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None].to(device)
+
+    transforms = T.Compose([
+        T.Resize(size),
+        T.ToTensor(),
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        if model_size not in ['atto', 'femto', 'pico', 'n']
+        else T.Lambda(lambda x: x)
+    ])
+    im_data = transforms(im_pil)[None]
+
+    blob = {
+        'images': im_data.to(device),
+        'orig_target_sizes': orig_size.to(device),
+    }
+
+    output = m(blob)
+    result_images = draw([im_pil], output['labels'], output['boxes'], output['scores'])
+    result_images[0].save('trt_result.jpg')
+    print("Image processing complete. Result saved as 'trt_result.jpg'.")
+
+def process_video(m, file_path, device, size=(640, 640), model_size='s'):
+    cap = cv2.VideoCapture(file_path)
+
+    # Get video properties
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+    # Define the codec and create VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter('trt_result.mp4', fourcc, fps, (orig_w, orig_h))
+
+    transforms = T.Compose([
+        T.Resize(size),
+        T.ToTensor(),
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        if model_size not in ['atto', 'femto', 'pico', 'n']
+        else T.Lambda(lambda x: x)
+    ])
+
+    frame_count = 0
+    print("Processing video frames...")
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Convert frame to PIL image
+        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+        w, h = frame_pil.size
+        orig_size = torch.tensor([w, h])[None].to(device)
+
+        im_data = transforms(frame_pil)[None]
+
+        blob = {
+            'images': im_data.to(device),
+            'orig_target_sizes': orig_size.to(device),
+        }
+
+        output = m(blob)
+
+        # Draw detections on the frame
+        result_images = draw([frame_pil], output['labels'], output['boxes'], output['scores'])
+
+        # Convert back to OpenCV image
+        frame = cv2.cvtColor(np.array(result_images[0]), cv2.COLOR_RGB2BGR)
+
+        # Write the frame
+        out.write(frame)
+        frame_count += 1
+
+        if frame_count % 10 == 0:
+            print(f"Processed {frame_count} frames...")
+
+    cap.release()
+    out.release()
+    print("Video processing complete. Result saved as 'trt_result.mp4'.")
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-trt', '--trt', type=str, required=True)
+    parser.add_argument('-i', '--input', type=str, required=True)
+    parser.add_argument('-d', '--device', type=str, default='cuda:0')
+    parser.add_argument('-s', '--size', type=int, required=True, help='input size, e.g., 640')
+    parser.add_argument('-ms', '--model-size', type=str, required=True, choices=['atto', 'femto', 'pico', 'n', 's', 'm', 'l', 'x'])
+
+
+    args = parser.parse_args()
+
+    m = TRTInference(args.trt, device=args.device)
+    size = (args.size,) * 2
+
+    file_path = args.input
+    if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
+        # Process as image
+        process_image(m, file_path, args.device, size, args.model_size)
+    else:
+        # Process as video
+        process_video(m, file_path, args.device, size, args.model_size)
diff --git a/tools/reference/convert_weight.py b/tools/reference/convert_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..9651d19a98b181658400137a74bcaf39be088567
--- /dev/null
+++ b/tools/reference/convert_weight.py
@@ -0,0 +1,29 @@
+import torch
+import os
+import argparse
+
+def save_only_ema_weights(checkpoint_file):
+    """Extract and save only the EMA weights."""
+    checkpoint = torch.load(checkpoint_file, map_location='cpu')
+
+    weights = {}
+    if 'ema' in checkpoint:
+        weights['model'] = checkpoint['ema']['module']
+    else:
+        raise ValueError("The checkpoint does not contain 'ema'.")
+
+    dir_name, base_name = os.path.split(checkpoint_file)
+    name, ext = os.path.splitext(base_name)
+    output_file = os.path.join(dir_name, f"{name}_converted{ext}")
+
+    torch.save(weights, output_file)
+    print(f"EMA weights saved to {output_file}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Extract and save only EMA weights.")
+    parser.add_argument('checkpoint_dir', type=str, help="Path to the directory containing .pth checkpoint files.")
+
+    args = parser.parse_args()
+    for file in os.listdir(args.checkpoint_dir):
+        if '.pth' in file and '_converted' not in file:
+            save_only_ema_weights(os.path.join(args.checkpoint_dir, file))
diff --git a/tools/reference/safe_training.sh b/tools/reference/safe_training.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d3c752a48f27511a353d65dbb9e8f97146ad4817
--- /dev/null
+++ b/tools/reference/safe_training.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Function to display the menu for selecting model size
+select_model_size() {
+    echo "Select model size:"
+    select size in s m l x; do
+        case $size in
+            s|m|l|x)
+                echo "You selected model size: $size"
+                MODEL_SIZE=$size
+                break
+                ;;
+            *)
+                echo "Invalid selection. Please try again."
+                ;;
+        esac
+    done
+}
+
+# Function to display the menu for selecting task
+select_task() {
+    echo "Select task:"
+    select task in obj365 obj2coco coco; do
+        case $task in
+            obj365|obj2coco|coco)
+                echo "You selected task: $task"
+                TASK=$task
+                break
+                ;;
+            *)
+                echo "Invalid selection. Please try again."
+                ;;
+        esac
+    done
+}
+
+# Function to ask if the user wants to save logs to a txt file
+ask_save_logs() {
+    while true; do
+        read -p "Do you want to save logs to a txt file? (y/n): " yn
(y/n): " yn + case $yn in + [Yy]* ) + SAVE_LOGS=true + break + ;; + [Nn]* ) + SAVE_LOGS=false + break + ;; + * ) echo "Please answer yes or no.";; + esac + done +} + +# Call the functions to let the user select +select_model_size +select_task +ask_save_logs + +# Set config file and output directory based on selection +if [ "$TASK" = "coco" ]; then + CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" +else + CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" +fi + +OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}" + +# Construct the training command +TRAIN_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR" + +# Append log redirection if SAVE_LOGS is true +if [ "$SAVE_LOGS" = true ]; then + LOG_FILE="${MODEL_SIZE}_${TASK}.txt" + TRAIN_CMD="$TRAIN_CMD &> \"$LOG_FILE\" 2>&1 &" +else + TRAIN_CMD="$TRAIN_CMD &" +fi + +# Run the training command +eval $TRAIN_CMD +if [ $? -ne 0 ]; then + echo "First training failed, restarting with resume option..." + while true; do + RESUME_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR -r ${OUTPUT_DIR}/last.pth" + if [ "$SAVE_LOGS" = true ]; then + LOG_FILE="${MODEL_SIZE}_${TASK}_2.txt" + RESUME_CMD="$RESUME_CMD &> \"$LOG_FILE\" 2>&1 &" + else + RESUME_CMD="$RESUME_CMD &" + fi + eval $RESUME_CMD + if [ $? -eq 0 ]; then + break + fi + done +fi diff --git a/tools/visualization/fiftyone_vis.py b/tools/visualization/fiftyone_vis.py new file mode 100644 index 0000000000000000000000000000000000000000..5831293b16c8c77209e97411bee00695332b24e2 --- /dev/null +++ b/tools/visualization/fiftyone_vis.py @@ -0,0 +1,307 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import argparse +import os +import subprocess +import sys +import time + +import fiftyone as fo +import fiftyone.core.fields as fof +import fiftyone.core.labels as fol +import fiftyone.core.models as fom +import fiftyone.zoo as foz +import torch +import torchvision.transforms as transforms +import tqdm +from fiftyone import ViewField as F +from PIL import Image + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) +from engine.core import YAMLConfig + + +def kill_existing_mongod(): + try: + result = subprocess.run(['ps', 'aux'], stdout=subprocess.PIPE) + processes = result.stdout.decode('utf-8').splitlines() + + for process in processes: + if 'mongod' in process and '--dbpath' in process: + # find mongod PID + pid = int(process.split()[1]) + print(f"Killing existing mongod process with PID: {pid}") + # kill mongod session + os.kill(pid, 9) + except Exception as e: + print(f"Error occurred while killing mongod: {e}") + +kill_existing_mongod() + + +label_map = { + 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane', + 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight', + 11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter', + 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', + 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', + 25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe', + 30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', + 35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat', + 40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket', + 44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork', + 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', + 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog', + 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa', + 64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window', + 69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop', + 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave', + 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender', + 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear', + 89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush' +} + +class CustomModel(fom.Model): + def __init__(self, cfg): + super().__init__() + self.model = cfg.model.eval().cuda() + self.postprocessor = cfg.postprocessor.eval().cuda() + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Resize((640, 640)), # Resize to the size expected by your model + # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + @property + def media_type(self): + return "image" + + @property + def has_logits(self): + return False + + @property + def has_embeddings(self): + return False + + @property + def ragged_batches(self): + return False + + @property + def transforms(self): + return None + + @property + def preprocess(self): + return True + + @preprocess.setter + def preprocess(self, value): + pass + + def _convert_predictions(self, predictions): + class_labels, bboxes, scores = predictions[0]['labels'], predictions[0]['boxes'], predictions[0]['scores'] + + detections = [] + for label, bbox, score in zip(class_labels, bboxes, scores): + detection = fol.Detection( + label=label_map[label.item()], + bounding_box=[ + bbox[0] / 640, # Normalized coordinates + bbox[1] / 640, + (bbox[2] - 
bbox[0]) / 640, + (bbox[3] - bbox[1]) / 640 + ], + confidence=score + ) + detections.append(detection) + + return fol.Detections(detections=detections) + + def predict(self, image): + image = Image.fromarray(image).convert('RGB') + image_tensor = self.transform(image).unsqueeze(0).cuda() + outputs = self.model(image_tensor) + orig_target_sizes = torch.tensor([[640, 640]]).cuda() + predictions = self.postprocessor(outputs, orig_target_sizes) + return self._convert_predictions(predictions) + + def predict_all(self, images): + image_tensors = [] + for image in images: + image = Image.fromarray(image) + image_tensor = self.transform(image) + image_tensors.append(image_tensor) + image_tensors = torch.stack(image_tensors).cuda() + outputs = self.model(image_tensors) + orig_target_sizes = torch.tensor([[640, 640] for image in images]).cuda() + predictions = self.postprocessor(outputs, orig_target_sizes) + converted_predictions = [self._convert_predictions(pred) for pred in predictions] + + # Ensure the output is a list of lists of Detections + return converted_predictions + +def filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3): + for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): + has_modified = False + for i, detection in enumerate(sample["predictions0"].detections): + + if "original_confidence" not in detection: + detection["original_confidence"] = detection["confidence"] + + if (detection["confidence"] <= confidence_threshold and sample["predictions5"].detections[i]["confidence"] >= confidence_threshold) or \ + (detection["confidence"] >= confidence_threshold and sample["predictions5"].detections[i]["confidence"] <= confidence_threshold): + + sample["predictions0"].detections[i]["confidence"] = sample["predictions5"].detections[i]["confidence"] + has_modified = True + if has_modified: + sample.save() + + +def restore_confidence(predictions_view): + for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): + for i, detection in enumerate(sample["predictions0"].detections): + if "original_confidence" in detection: + detection["confidence"] = detection["original_confidence"] + sample.save() + +def fast_iou(bbox1, bbox2): + x1, y1, w1, h1 = bbox1 + x2, y2, w2, h2 = bbox2 + xA = max(x1, x2) + yA = max(y1, y2) + xB = min(x1 + w1, x2 + w2) + yB = min(y1 + h1, y2 + h2) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = w1 * h1 + boxBArea = w2 * h2 + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + +def assign_iou_diff(predictions_view): + for sample in predictions_view: + ious_0 = [detection.eval0_iou if 'eval0_iou' in detection else None for detection in sample["predictions0"].detections] + ious_5 = [detection.eval5_iou if 'eval5_iou' in detection else None for detection in sample["predictions5"].detections] + bbox_0 = [detection.bounding_box for detection in sample["predictions0"].detections] + bbox_5 = [detection.bounding_box for detection in sample["predictions5"].detections] + # iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None else -1 for iou_0, iou_5 in zip(ious_0, ious_5)] + iou_inter = [fast_iou(b0, b5) for b0, b5 in zip(bbox_0, bbox_5)] + iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None and iou_inter > 0.5 else -1 for iou_0, iou_5, iou_inter in zip(ious_0, ious_5, iou_inter)] + + for detection, iou_diff in zip(sample["predictions0"].detections, iou_diffs): + detection["iou_diff"] = iou_diff + for detection, iou_diff 
in zip(sample["predictions5"].detections, iou_diffs): + detection["iou_diff"] = iou_diff + # for detection, iou_diff in zip(sample["predictions100"].detections, iou_diffs): + # detection["iou_diff"] = iou_diff + sample.save() + +def main(args): + try: + if os.path.exists("saved_predictions_view") and os.path.exists("saved_filtered_view"): + print("Loading saved predictions and filtered views...") + dataset = foz.load_zoo_dataset( + "coco-2017", + split="validation", + dataset_name="evaluate-detections-tutorial", + dataset_dir="data/fiftyone" + ) + + dataset.persistent = True + session = fo.launch_app(dataset, port=args.port) + + predictions_view = fo.Dataset.from_dir( + dataset_dir="saved_predictions_view", + dataset_type=fo.types.FiftyOneDataset + ).view() + filtered_view = fo.Dataset.from_dir( + dataset_dir="saved_filtered_view", + dataset_type=fo.types.FiftyOneDataset + ).view() + else: + dataset = foz.load_zoo_dataset( + "coco-2017", + split="validation", + dataset_name="evaluate-detections-tutorial", + dataset_dir="data/fiftyone" + ) + + dataset.persistent = True + + session = fo.launch_app(dataset, port=args.port) + cfg = YAMLConfig(args.config, resume=args.resume) + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('only support resume to load model.state_dict by now.') + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + predictions_view = dataset.take(500, seed=51) + + model = CustomModel(cfg) + L = model.model.decoder.decoder.eval_idx + # Apply models and save predictions in different label fields + for i in [L]: + model.model.decoder.decoder.eval_idx = i + label_field = "predictions{:d}".format(i) + predictions_view.apply_model(model, label_field=label_field) + + # filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3) + for i in [L]: + label_field = "predictions{:d}".format(i) + predictions_view = predictions_view.filter_labels(label_field, F("confidence") > 0.5, only_matches=False) + eval_key = "eval{:d}".format(i) + _ = predictions_view.evaluate_detections( + label_field, + gt_field="ground_truth", + eval_key=eval_key, + compute_mAP=True, + ) + + # assign_iou_diff(predictions_view) + + # filtered_view = predictions_view.filter_labels("predictions0", F("iou_diff") > 0.05, only_matches=True) + # filtered_view = filtered_view.filter_labels("predictions5", F("iou_diff") > 0.05, only_matches=True) + # restore_confidence(filtered_view) + + predictions_view.export( + export_dir="saved_predictions_view", + dataset_type=fo.types.FiftyOneDataset + ) + # filtered_view.export( + # export_dir="saved_filtered_view", + # dataset_type=fo.types.FiftyOneDataset + # ) + + # Display the filtered view + session.view = predictions_view + + # Keep the session open + while True: + time.sleep(1) + except Exception as e: + print(f"An error occurred: {e}") + finally: + print("Shutting down session") + if 'session' in locals(): + session.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', type=str) + parser.add_argument('--resume', '-r', type=str) + parser.add_argument('--port', '-p', type=int) + args = parser.parse_args() + + main(args)