diff --git a/configs/base/dataloader.yml b/configs/base/dataloader.yml
new file mode 100644
index 0000000000000000000000000000000000000000..22de3aa4645e9620c0297396da56c06c8b47e8a4
--- /dev/null
+++ b/configs/base/dataloader.yml
@@ -0,0 +1,39 @@
+
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        name: stop_epoch
+        epoch: 72 # epochs in [72, ~) stop `ops`
+        ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] # exclude Mosaic
+
+  collate_fn:
+    type: BatchImageCollateFunction
+    base_size: 640
+    base_size_repeat: 3
+    stop_epoch: 72 # epochs in [72, ~) stop `multiscales`
+
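+  # Assumed reading of the two stop knobs above (illustrative comment, not parsed config):
+  #   epoch in [0, 72)  -> the stochastic `ops` are active and the collate function samples
+  #                        multi-scale sizes derived from base_size (base_size_repeat is
+  #                        assumed to control how often the base 640 size recurs)
+  #   epoch in [72, ~)  -> the three ops named under `policy.ops` are disabled and
+  #                        multi-scale collation stops; only fixed 640x640 batches remain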
+  shuffle: True
+  total_batch_size: 32 # total batch size of 32 (e.g., 4 GPUs x 8 samples each)
+  num_workers: 4
+
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+  shuffle: False
+  total_batch_size: 64
+  num_workers: 4
diff --git a/configs/base/deim.yml b/configs/base/deim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3aa63588df77c2063c00632a90dfd45dbdee0ef5
--- /dev/null
+++ b/configs/base/deim.yml
@@ -0,0 +1,48 @@
+# Dense O2O
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+
+# Unfreezing BN
+HGNetv2:
+  freeze_at: -1 # 0 default
+  freeze_norm: False # True default
+
+# Activation
+DFINETransformer:
+  activation: silu
+  mlp_act: silu
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Our Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['mal', 'boxes', 'local']
+  gamma: 1.5
\ No newline at end of file
diff --git a/configs/base/deimv2.yml b/configs/base/deimv2.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7428898f8f734a7cb48cadcd4c7a9bd6f719d9a2
--- /dev/null
+++ b/configs/base/deimv2.yml
@@ -0,0 +1,144 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 5 # save freq
+
+DEIM:
+  backbone: HGNetv2
+  encoder: HybridEncoder
+  decoder: DEIMTransformer
+
+HGNetv2:
+  name: 'B4'
+  return_idx: [1, 2, 3]
+  freeze_at: -1 # 0 default
+  freeze_stem_only: True
+  freeze_norm: False # True default
+  pretrained: True
+  local_model_dir: ./weight/hgnetv2/
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  # New
+  version: deim
+  csp_type: csp2
+  fuse_op: sum
+
+DEIMTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  eval_idx: -1
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+
+  reg_max: 32
+  reg_scale: 4
+  layer_scale: 1 # 2
+
+  num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+  # Act
+  activation: silu
+  mlp_act: silu
+
+  # FFN
+  dim_feedforward: 2048
+
+PostProcessor:
+  num_top_queries: 300
+
+
+## DEIM LR-Scheduler
+epoches: 58 # 72 + 2n # Increase to search for the optimal ema
+
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      # Mosaic options
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    # Mixup options
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+    # CopyBlend options
+    copyblend_prob: 0.5
+    copyblend_epochs: [4, 50]
+    area_threshold: 100
+    num_objects: 3
+    with_expand: True
+    expand_ratios: [0.1, 0.25]
+
+    ema_restart_decay: 0.9999
+    base_size_repeat: 4
+
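+# CopyBlend sketch (assumed semantics, inferred only from the key names above):
+# with probability copyblend_prob, up to num_objects instances whose box area
+# exceeds area_threshold pixels are pasted into the image between epochs 4 and 50;
+# with_expand/expand_ratios are assumed to enlarge each pasted crop by 10-25%
+# so blended object borders keep some surrounding context.
+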
+## DEIM Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['mal', 'boxes', 'local']
+  gamma: 1.5
+  alpha: 0.75
+  reg_max: 32
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
+    # change matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 45
\ No newline at end of file
diff --git a/configs/base/dfine_hgnetv2.yml b/configs/base/dfine_hgnetv2.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e9de6d1b10341f1e5f3d525213aacb48a19e5aaf
--- /dev/null
+++ b/configs/base/dfine_hgnetv2.yml
@@ -0,0 +1,90 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 4 # save freq
+
+DEIM:
+  backbone: HGNetv2
+  encoder: HybridEncoder
+  decoder: DFINETransformer
+
+# Add, default for step lr scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 500
+flat_epoch: 4000000
+no_aug_epoch: 0
+
+HGNetv2:
+  pretrained: True
+  local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+
+DFINETransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  eval_idx: -1
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+
+  # NEW
+  reg_max: 32
+  reg_scale: 4
+
+  # Auxiliary decoder layers dimension scaling
+  # e.g., if num_layers: 6 and eval_idx: -4,
+  # then layers 3, 4, 5 are auxiliary decoder layers.
+  layer_scale: 1 # 2
+
+
+  num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+
+PostProcessor:
+  num_top_queries: 300
+
+
+DEIMCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
+  losses: ['vfl', 'boxes', 'local']
+  alpha: 0.75
+  gamma: 2.0
+  reg_max: 32
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
\ No newline at end of file
diff --git a/configs/base/optimizer.yml b/configs/base/optimizer.yml
new file mode 100644
index 0000000000000000000000000000000000000000..db490088f0220b72309f1d7a4ab1ca6aafb45322
--- /dev/null
+++ b/configs/base/optimizer.yml
@@ -0,0 +1,35 @@
+use_amp: True
+use_ema: True
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 1000
+  start: 0
+
+epoches: 72
+clip_max_norm: 0.1
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.0000125
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
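+  # How the `params` regexes above are assumed to partition the model
+  # (illustrative names; the actual parameter names depend on the model):
+  #   backbone.stages.0.conv.weight   -> backbone group, lr 0.0000125
+  #   encoder.layers.0.norm.weight    -> encoder/decoder norm/bn group, weight_decay 0
+  #   decoder.layers.0.linear1.weight -> default group, lr 0.00025, weight_decay 0.000125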
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [500]
+  gamma: 0.1
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 500
diff --git a/configs/base/rt_deim.yml b/configs/base/rt_deim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d195ce9ea09165289e4c72301ea14b7ac45971ce
--- /dev/null
+++ b/configs/base/rt_deim.yml
@@ -0,0 +1,49 @@
+# Dense O2O
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+        ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+        mosaic_prob: 0.5
+
+  collate_fn:
+    mixup_prob: 0.5
+    mixup_epochs: [4, 29]
+    stop_epoch: 50 # epochs in [50, ~) stop `multiscales`
+
+# Unfreezing BN
+PResNet:
+  freeze_at: -1 # default 0
+  freeze_norm: False # default True
+
+# Activation
+RTDETRTransformerv2:
+  query_pos_method: as_reg
+  activation: silu
+  mlp_act: silu
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Our Loss
+DEIMCriterion:
+  weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2}
+  losses: ['mal', 'boxes', ]
+  gamma: 1.5
\ No newline at end of file
diff --git a/configs/base/rt_optimizer.yml b/configs/base/rt_optimizer.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0dbada062e6325083f3e79991b1965d2c0fd7901
--- /dev/null
+++ b/configs/base/rt_optimizer.yml
@@ -0,0 +1,37 @@
+use_amp: True
+use_ema: True
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+  start: 0
+
+epoches: 72
+clip_max_norm: 0.1
+
+train_dataloader:
+  total_batch_size: 16
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 2000
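+# LinearWarmup sketch (assumed behavior): the lr ramps linearly from ~0 to each
+# group's target over warmup_duration optimizer steps, e.g. at step 1000 of 2000
+# the default group runs at roughly 0.5 * 0.0001 = 0.00005; MultiStepLR takes over after.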
diff --git a/configs/base/rtdetrv2_r50vd.yml b/configs/base/rtdetrv2_r50vd.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e00de349bc6ef8d1a09a19634dc6e9d0c6b4ac41
--- /dev/null
+++ b/configs/base/rtdetrv2_r50vd.yml
@@ -0,0 +1,90 @@
+task: detection
+
+model: DEIM
+criterion: DEIMCriterion
+postprocessor: PostProcessor
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+checkpoint_freq: 4 # save freq
+
+DEIM:
+  backbone: PResNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformerv2
+
+
+# Add, default for step lr scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 2000
+flat_epoch: 4000000
+no_aug_epoch: 0
+
+PResNet:
+  depth: 50
+  variant: d
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+  freeze_norm: True
+  pretrained: True
+  local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/
+
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+  version: rt_detrv2 # pay attention to this
+
+
+RTDETRTransformerv2:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0 # 1.0 0.4
+
+  eval_idx: -1
+
+  # NEW, can be chosen
+  num_points: [4, 4, 4] # [3,3,3] [2,2,2]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic
+
+
+PostProcessor:
+  num_top_queries: 300
+
+DEIMCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+  use_uni_set: False
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
\ No newline at end of file
diff --git a/configs/coco_detection.yml b/configs/dataset/coco_detection.yml
similarity index 100%
rename from configs/coco_detection.yml
rename to configs/dataset/coco_detection.yml
diff --git a/configs/dataset/crowdhuman_detection.yml b/configs/dataset/crowdhuman_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f4dc707db67f651533671034dea92144083f74f9
--- /dev/null
+++ b/configs/dataset/crowdhuman_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 2 # your dataset classes
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train
+    ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val
+    ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
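+# Note on `ops: ~` above: `~` is YAML null, so these dataset configs define no
+# transforms themselves; the actual train/val pipelines are assumed to come from
+# the base dataloader config when the files are merged via `__include__`.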
diff --git a/configs/dataset/custom_detection.yml b/configs/dataset/custom_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..35435ad68e29d99d8f9f69100cd56a2c403fe710
--- /dev/null
+++ b/configs/dataset/custom_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 777 # your dataset classes
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /data/yourdataset/train
+    ann_file: /data/yourdataset/train/train.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /data/yourdataset/val
+    ann_file: /data/yourdataset/val/val.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/configs/dataset/obj365_detection.yml b/configs/dataset/obj365_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e843e85bf2d53de3e61fdd109cf51ab9fc9957e3
--- /dev/null
+++ b/configs/dataset/obj365_detection.yml
@@ -0,0 +1,41 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 366
+remap_mscoco_category: False
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /home/Dataset/objects365/train
+    ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: CocoDetection
+    img_folder: /home/Dataset/objects365/val
+    ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/configs/dataset/voc_detection.yml b/configs/dataset/voc_detection.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1f9ceeb8881653d496ac5fd02c465aea5306d72f
--- /dev/null
+++ b/configs/dataset/voc_detection.yml
@@ -0,0 +1,40 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 20
+
+train_dataloader:
+  type: DataLoader
+  dataset:
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: trainval.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset:
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: test.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
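+# Hypothetical adaptation of custom_detection.yml for a 3-class COCO-format dataset
+# (illustrative values only; paths and class count are placeholders):
+#   num_classes: 3
+#   train_dataloader:
+#     dataset:
+#       img_folder: /data/mydataset/train
+#       ann_file: /data/mydataset/annotations/train.json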
diff --git a/configs/deim_dfine/deim_hgnetv2_l_coco.yml b/configs/deim_dfine/deim_hgnetv2_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b35a78e453d52d15291fd91dd04c9a55cfec8af
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_l_coco.yml
@@ -0,0 +1,37 @@
+__include__: [
+  './dfine_hgnetv2_l_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_l_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 58 # 72 + 2n
+
+## Our LR-Scheduler
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_m_coco.yml b/configs/deim_dfine/deim_hgnetv2_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9fa5167620c57f6fdb14892dd1cf9a00839fde92
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_m_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './dfine_hgnetv2_m_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_m_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*bn).*$'
+      lr: 0.00004
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0004
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 102 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 49, 90] # list
+
+  collate_fn:
+    mixup_epochs: [4, 49]
+    stop_epoch: 90
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_n_coco.yml b/configs/deim_dfine/deim_hgnetv2_n_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..62db245d96e4cb402e774efe9cb26fc1f401e40e
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_n_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  './dfine_hgnetv2_n_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./deim_outputs/deim_hgnetv2_n_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0004
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0004
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0008
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# Increase to search for the optimal ema
+epoches: 160 # 148 + 12
+
+## Our LR-Scheduler
+flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+lr_gamma: 1.0
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 78, 148] # list
+
+  collate_fn:
+    mixup_epochs: [4, 78]
+    stop_epoch: 148
+    base_size_repeat: ~
\ No newline at end of file
diff --git a/configs/deim_dfine/deim_hgnetv2_s_coco.yml b/configs/deim_dfine/deim_hgnetv2_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..68ea99aae9926cefb33b3380d37a9cbd70ed28eb
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_s_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './dfine_hgnetv2_s_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_s_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*bn).*$'
+      lr: 0.0002
+    -
+      params: '^(?=.*(?:norm|bn)).*$' # except bias
+      weight_decay: 0.
+
+  lr: 0.0004
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 120] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 120
\ No newline at end of file
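+# Assumed `__include__` semantics for the deim_hgnetv2_* files above: the listed
+# configs are loaded first and merged key by key, so a minimal variant could be:
+#   __include__: ['./dfine_hgnetv2_l_coco.yml', '../base/deim.yml']
+#   epoches: 58          # overrides the included epoch count
+#   optimizer: {...}     # replaces the base param groups wholesale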
diff --git a/configs/deim_dfine/deim_hgnetv2_x_coco.yml b/configs/deim_dfine/deim_hgnetv2_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8ec7f1b611089c6b8aa3a974c980f862cf821679
--- /dev/null
+++ b/configs/deim_dfine/deim_hgnetv2_x_coco.yml
@@ -0,0 +1,37 @@
+__include__: [
+  './dfine_hgnetv2_x_coco.yml',
+  '../base/deim.yml'
+]
+
+output_dir: ./outputs/deim_hgnetv2_x_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000005
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 58 # 72 + 2n
+
+## Our LR-Scheduler
+flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
\ No newline at end of file
diff --git a/configs/deim_dfine/dfine_hgnetv2_l_coco.yml b/configs/deim_dfine/dfine_hgnetv2_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..16c6002434659b5918db2f08955f054ca9c83d81
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_l_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./outputs/dfine_hgnetv2_l_coco
+
+
+HGNetv2:
+  name: 'B4'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000125
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+# Increase to search for the optimal ema
+epoches: 80 # 72 + 2n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 72
+  collate_fn:
+    stop_epoch: 72
+    ema_restart_decay: 0.9999
+    base_size_repeat: 4
diff --git a/configs/deim_dfine/dfine_hgnetv2_m_coco.yml b/configs/deim_dfine/dfine_hgnetv2_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b5f917bd5723b931bb09e0389eb49e2e15af8c8
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_m_coco.yml
@@ -0,0 +1,60 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_m_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B2'
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+DFINETransformer:
+  num_layers: 4 # 5 6
+  eval_idx: -1 # -2 -3
+
+HybridEncoder:
+  in_channels: [384, 768, 1536]
+  hidden_dim: 256
+  depth_mult: 0.67
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00002
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 120
+  collate_fn:
+    stop_epoch: 120
+    ema_restart_decay: 0.9999
+    base_size_repeat: 6
diff --git a/configs/deim_dfine/dfine_hgnetv2_n_coco.yml b/configs/deim_dfine/dfine_hgnetv2_n_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c45e65357cbfe547851e8b7385be67ca7226f6cd
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_n_coco.yml
@@ -0,0 +1,82 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_n_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B0'
+  return_idx: [2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+
+HybridEncoder:
+  in_channels: [512, 1024]
+  feat_strides: [16, 32]
+
+  # intra
+  hidden_dim: 128
+  use_encoder_idx: [1]
+  dim_feedforward: 512
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+
+
+DFINETransformer:
+  feat_channels: [128, 128]
+  feat_strides: [16, 32]
+  hidden_dim: 128
+  dim_feedforward: 512
+  num_levels: 2
+
+  num_layers: 3
+  eval_idx: -1
+
+  num_points: [6, 6]
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0004
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0004
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0008
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 160 # 148 + 4n
+train_dataloader:
+  total_batch_size: 128
+  dataset:
+    transforms:
+      policy:
+        epoch: 148
+  collate_fn:
+    stop_epoch: 148
+    ema_restart_decay: 0.9999
+    base_size_repeat: ~
+
+val_dataloader:
+  total_batch_size: 256
diff --git a/configs/deim_dfine/dfine_hgnetv2_s_coco.yml b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..33857bc47fe0311586a28b16f7c2d58af909ffbc
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml
@@ -0,0 +1,61 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_s_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B0'
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+DFINETransformer:
+  num_layers: 3 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+HybridEncoder:
+  in_channels: [256, 512, 1024]
+  hidden_dim: 256
+  depth_mult: 0.34
+  expansion: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0001
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0001
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 120
+  collate_fn:
+    stop_epoch: 120
+    ema_restart_decay: 0.9999
+    base_size_repeat: 20
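+# Scaling note (assumed convention shared by these variants): depth_mult is taken
+# to scale the number of repeated CSP blocks in the encoder and expansion their
+# hidden width, so the S model (depth_mult 0.34, expansion 0.5) keeps the L
+# topology at roughly a third of the depth and half of the width.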
diff --git a/configs/deim_dfine/dfine_hgnetv2_x_coco.yml b/configs/deim_dfine/dfine_hgnetv2_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..46ec15753906b37f9d0f61bf8e0637c77c295251
--- /dev/null
+++ b/configs/deim_dfine/dfine_hgnetv2_x_coco.yml
@@ -0,0 +1,56 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./output/dfine_hgnetv2_x_coco
+
+
+DEIM:
+  backbone: HGNetv2
+
+HGNetv2:
+  name: 'B5'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+DFINETransformer:
+  feat_channels: [384, 384, 384]
+  reg_scale: 8
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+# Increase to search for the optimal ema
+epoches: 80 # 72 + 2n
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 72
+  collate_fn:
+    stop_epoch: 72
+    ema_restart_decay: 0.9998
+    base_size_repeat: 3
diff --git a/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml b/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml
new file mode 100644
index 0000000000000000000000000000000000000000..28fcd4c12f2ab91bffb111e33b18b612a9bda12d
--- /dev/null
+++ b/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml
@@ -0,0 +1,50 @@
+__include__: [
+  './dfine_hgnetv2_x_obj2coco.yml',
+  '../../base/deim.yml'
+]
+
+output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e
+
+HGNetv2:
+  freeze_at: 0 # 0 default
+  freeze_norm: True # True default
+
+# Activation
+DFINETransformer:
+  activation: relu
+  mlp_act: relu
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+# Increase to search for the optimal ema
+epoches: 24 # 72 + 2n
+
+## Our LR-Scheduler
+lrsheduler: flatcosine
+lr_gamma: 1
+warmup_iter: 0 # 0
+flat_epoch: 12000 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 4
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [2, 12, 20] # list
+
+  collate_fn:
+    mixup_epochs: [2, 12]
+    stop_epoch: 20
\ No newline at end of file
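+# Fine-tuning recipe as configured above (stated for orientation): the Objects365
+# checkpoint is adapted to COCO for only 24 epochs, with the stem and norm layers
+# kept frozen (freeze_at: 0, freeze_norm: True), relu activations, no LR warmup
+# (warmup_iter: 0), and a backbone LR (2.5e-6) 100x smaller than the 2.5e-4 base LR.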
diff --git a/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml b/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c9c711b678e953d7589ad106c2aef064bd906ae6
--- /dev/null
+++ b/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml
@@ -0,0 +1,57 @@
+__include__: [
+  '../../dataset/coco_detection.yml',
+  '../../runtime.yml',
+  '../../base/dataloader.yml',
+  '../../base/optimizer.yml',
+  '../../base/dfine_hgnetv2.yml',
+]
+
+output_dir: ./outputs/dfine_hgnetv2_x_obj2coco
+
+HGNetv2:
+  name: 'B5'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+DFINETransformer:
+  feat_channels: [384, 384, 384]
+  reg_scale: 8
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0000025
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.00025
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+epoches: 36 # Early stop
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 30
+  collate_fn:
+    stop_epoch: 30
+    ema_restart_decay: 0.9999
+    base_size_repeat: 3
+
+ema:
+  warmups: 0
+
+lr_warmup_scheduler:
+  warmup_duration: 0
diff --git a/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml b/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..538a9afb8b9e1187d3ebf2cccd963d6eec35fd70
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml
@@ -0,0 +1,36 @@
+__include__: [
+  './rtdetrv2_r101vd_6x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.000002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
diff --git a/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml b/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2b2069fa727453a3f34f9a67c5b9f59ffce7773d
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml
@@ -0,0 +1,32 @@
+__include__: [
+  './rtdetrv2_r18vd_120e_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 120
+flat_epoch: 64 # 4 + 120 / 2
+no_aug_epoch: 3
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 117] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 117
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml b/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fb9d23f72a49df713c5d0c4e91ed5e1b95d57e73
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml
@@ -0,0 +1,36 @@
+__include__: [
+  './rtdetrv2_r34vd_120e_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.0001
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+# change part
+epoches: 120
+flat_epoch: 64
+no_aug_epoch: 3
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 64, 117] # list
+
+  collate_fn:
+    mixup_epochs: [4, 64]
+    stop_epoch: 117
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml b/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7427c57248f0f2740512fa65b6e5642d2da99709
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml
@@ -0,0 +1,35 @@
+__include__: [
+  './rtdetrv2_r50vd_6x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml b/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b08bdc3713cb32fc23fc95ea9fa37f94fc7feb43
--- /dev/null
+++ b/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml
@@ -0,0 +1,39 @@
+__include__: [
+  './rtdetrv2_r50vd_m_7x_coco.yml',
+  '../base/rt_deim.yml',
+]
+
+output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for eval
+  num_layers: 3
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00002
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# change part
+epoches: 60
+flat_epoch: 34 # 4 + 60 / 2
+no_aug_epoch: 2
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: [4, 34, 58] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 58
\ No newline at end of file
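+# eval_idx note (assumed indexing): decoder layers are 0-based, so with
+# num_layers: 3 the `eval_idx: 2` above selects the last (3rd) layer, which is
+# equivalent to the `eval_idx: -1` convention used elsewhere in these configs.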
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..256a089b2886fc1830923e957732b7a07bc4273b
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
@@ -0,0 +1,40 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r101vd_6x_coco
+
+
+PResNet:
+  depth: 101
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformerv2:
+  feat_channels: [384, 384, 384]
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..04d4d533da9a0bcd6f64e9eb2ddbcdd76ddf8edf
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+RTDETRTransformerv2:
+  num_layers: 3
+
+
+epoches: 120
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9cfb522a1870818e2411d863b40f5f35b2289b12
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
@@ -0,0 +1,57 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r34vd_120e_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 4
+
+
+epoches: 120
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00005
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00005
+      weight_decay: 0.
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    stop_epoch: 117
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3ffe8505be0e15b5f22f088d9215da86fdabc970
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
@@ -0,0 +1,25 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./outputs/rtdetrv2_r50vd_6x_coco
+
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
\ No newline at end of file
diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml b/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..760a3866760fd487a4788a9d9efa249e5b65d6a4
--- /dev/null
+++ b/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
@@ -0,0 +1,43 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/rt_optimizer.yml',
+  '../base/rtdetrv2_r50vd.yml',
+]
+
+output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco
+
+
+HybridEncoder:
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for eval
+
+
+epoches: 84
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader:
+  dataset:
+    transforms:
+      policy:
+        epoch: 81
+  collate_fn:
+    stop_epoch: 81
\ No newline at end of file
diff --git a/configs/deimv2/deimv2_dinov3_l_coco.yml b/configs/deimv2/deimv2_dinov3_l_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7bb97cb7bbcad41c59ae8d9f01994862296d79b5
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_l_coco.yml
@@ -0,0 +1,104 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+
+output_dir: ./outputs/deimv2_dinov3_l_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: dinov3_vits16
+  weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth
+  interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
+  finetune: True
+  conv_inplane: 32
+  hidden_dim: 224
+
+HybridEncoder:
+  in_channels: [224, 224, 224]
+  hidden_dim: 224
+  dim_feedforward: 896
+
+DEIMTransformer:
+  feat_channels: [224, 224, 224]
+  hidden_dim: 224
+  num_layers: 4
+  eval_idx: -1
+  dim_feedforward: 1792
+
+## DEIM LR-Scheduler
+epoches: 68 # 72 + 2n # Increase to search for the optimal ema
+
+lrsheduler: flatcosine
+lr_gamma: 0.5
+warmup_iter: 2000
+flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 8
+
+## Optimizer
+optimizer:
+  type: AdamW
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.0000125
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.0000125
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 34, 60] # list
+
+  collate_fn:
+    mixup_epochs: [4, 34]
+    stop_epoch: 60
+    copyblend_epochs: [4, 60]
+    base_size_repeat: 3
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+## DEIM Loss
+DEIMCriterion:
+  matcher:
+    matcher_change_epoch: 50
\ No newline at end of file
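+# Note: unlike the HGNetv2 pipelines, the DINOv3 configs add a Normalize op with
+# ImageNet mean/std to both the train and val transforms, which the ViT backbone
+# is assumed to expect; keep it in sync between the two pipelines.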
diff --git a/configs/deimv2/deimv2_dinov3_m_coco.yml b/configs/deimv2/deimv2_dinov3_m_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c4b17334138bad9e8bd431e858b50abc6c2514b3
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_m_coco.yml
@@ -0,0 +1,107 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_dinov3_m_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: vit_tinyplus
+  embed_dim: 256
+  weights_path: ./ckpts/vittplus_distill.pt
+  interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
+  num_heads: 4
+
+HybridEncoder:
+  in_channels: [256, 256, 256]
+  depth_mult: 1
+  expansion: 0.67
+  hidden_dim: 256
+  dim_feedforward: 512
+
+
+DEIMTransformer:
+  feat_channels: [256, 256, 256]
+  hidden_dim: 256
+  dim_feedforward: 512
+  num_layers: 4 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+optimizer:
+  type: AdamW
+
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+epoches: 102 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 49, 90] # list
+
+  collate_fn:
+    mixup_prob: 0.5
+    ema_restart_decay: 0.9999
+    base_size_repeat: 6
+    mixup_epochs: [4, 49]
+    stop_epoch: 90
+    copyblend_epochs: [4, 90]
+
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+DEIMCriterion:
+  matcher:
+    # new matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 80
diff --git a/configs/deimv2/deimv2_dinov3_s_coco.yml b/configs/deimv2/deimv2_dinov3_s_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c8ec7ea3b95902fcd88592cbd5ce478a09b788c0
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_s_coco.yml
@@ -0,0 +1,108 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_dinov3_s_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: vit_tiny
+  embed_dim: 192
+  weights_path: ./ckpts/vitt_distill.pt
+  interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32]
+  num_heads: 3
+
+HybridEncoder:
+  in_channels: [192, 192, 192]
+  depth_mult: 0.67
+  expansion: 0.34
+  hidden_dim: 192
+  dim_feedforward: 512
+
+DEIMTransformer:
+  feat_channels: [192, 192, 192]
+  hidden_dim: 192
+  dim_feedforward: 512
+  num_layers: 4 # 4 5 6
+  eval_idx: -1 # -2 -3 -4
+
+
+## Optimizer
+optimizer:
+  type: AdamW
+
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+    -
+      # including all norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.000025
+      weight_decay: 0.
+    -
+      # including all norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+# Increase to search for the optimal ema
+epoches: 132 # 120 + 4n
+
+## Our LR-Scheduler
+flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 12
+
+## Our DataAug
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 64, 120] # list
+
+  collate_fn:
+    base_size: 640
+    mixup_prob: 0.5
+    ema_restart_decay: 0.9999
+    base_size_repeat: 20
+    mixup_epochs: [4, 64]
+    stop_epoch: 120
+    copyblend_epochs: [4, 120]
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+
+DEIMCriterion:
+  matcher:
+    # change matcher
+    change_matcher: True
+    iou_order_alpha: 4.0
+    matcher_change_epoch: 100
diff --git a/configs/deimv2/deimv2_dinov3_x_coco.yml b/configs/deimv2/deimv2_dinov3_x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f85120b4905dc25318f564d749fd2ba789811fed
--- /dev/null
+++ b/configs/deimv2/deimv2_dinov3_x_coco.yml
@@ -0,0 +1,94 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+
+output_dir: ./outputs/deimv2_dinov3_x_coco
+
+DEIM:
+  backbone: DINOv3STAs
+
+DINOv3STAs:
+  name: dinov3_vits16plus
+  weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth
+  interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32]
+  finetune: True
+  conv_inplane: 64
+  hidden_dim: 256
+
+HybridEncoder:
+  in_channels: [256, 256, 256]
+  # intra
+  hidden_dim: 256
+  dim_feedforward: 1024
+
+  # cross
+  expansion: 1.25
+  depth_mult: 1.37
+
+DEIMTransformer:
+  num_layers: 6
+  eval_idx: -1
+  feat_channels: [256, 256, 256]
+  # reg_scale: 8
+  hidden_dim: 256
+  dim_feedforward: 2048
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      # except norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$'
+      lr: 0.00001
+    -
+      # including norm/bn/bias in self.dinov3
+      params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$'
+      lr: 0.00001
+      weight_decay: 0.
+    -
+      # including norm/bn/bias except for the self.dinov3
+      params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0005
+  betas: [0.9, 0.999]
+  weight_decay: 0.000125
+
+## Dense O2O: Mosaic + Mixup + CopyBlend
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 29, 50] # list
+
+  collate_fn:
+    mixup_epochs: [4, 29]
+    stop_epoch: 50
+    copyblend_epochs: [4, 50]
+    base_size_repeat: 3
+
+val_dataloader:
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [640, 640], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
\ No newline at end of file
diff --git a/configs/deimv2/deimv2_hgnetv2_atto_coco.yml b/configs/deimv2/deimv2_hgnetv2_atto_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4d770494fd5b99bb50199ae809920c57a97d98a7
--- /dev/null
+++ b/configs/deimv2/deimv2_hgnetv2_atto_coco.yml
@@ -0,0 +1,123 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_hgnetv2_atto_coco
+
+DEIM:
+  encoder: LiteEncoder
+
+HGNetv2:
+  name: 'Atto'
+  return_idx: [2]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+LiteEncoder:
+  in_channels: [256]
+  feat_strides: [16]
+  # intra
+  hidden_dim: 64
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+  act: 'silu'
+
+
+DEIMTransformer:
+  feat_channels: [64, 64]
+  feat_strides: [16, 32]
+  hidden_dim: 64
+  num_levels: 2
+  num_points: [4, 2]
+
+  num_layers: 3
+  eval_idx: -1
+  num_queries: 100
+
+  # FFN
+  dim_feedforward: 160
+
+  # New options for DEIMv2
+  share_bbox_head: True
+  use_gateway: False
+
+# Increase to search for the optimal ema
+epoches: 500 # 468 + 32
+
+## Our LR-Scheduler
+warmup_iter: 4000
+flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 32
+lr_gamma: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    - params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.001
+    - params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.001
+      weight_decay: 0.
+    - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias
+      weight_decay: 0.
+
+  lr: 0.002
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+eval_spatial_size: [320, 320]
+train_dataloader:
+  total_batch_size: 128
+  dataset:
+    transforms:
+      ops:
+        - {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5],
+           probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True}
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 12}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [320, 320], }
+        - {type: SanitizeBoundingBoxes, min_size: 12}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        epoch: [4, 250, 400] # list
+        mosaic_prob: 0.3
+
+  collate_fn:
+    mixup_prob: 0.0
+    mixup_epochs: [40000, 15000]
+    copyblend_prob: 0.0
+    copyblend_epochs: [40000, 15000]
+
+    stop_epoch: 468 # 468 + 32
+    ema_restart_decay: 0.9999
+    base_size: 320
+    base_size_repeat: ~
+
+val_dataloader:
+  total_batch_size: 256
+  dataset:
+    transforms:
+      ops:
+        - {type: Resize, size: [320, 320], }
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+  shuffle: False
+  num_workers: 16
+
+
+DEIMCriterion:
+  losses: ['mal', 'boxes'] # , 'local'
+  use_uni_set: False
+
+  matcher:
+    matcher_change_epoch: 450 # FIX This
\ No newline at end of file
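+# As configured, the Atto recipe disables MixUp and CopyBlend outright: their
+# probabilities are 0.0 and their epoch windows ([40000, 15000]) lie far beyond
+# the 500-epoch schedule, so only Mosaic (mosaic_prob: 0.3) remains active.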
diff --git a/configs/deimv2/deimv2_hgnetv2_femto_coco.yml b/configs/deimv2/deimv2_hgnetv2_femto_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7a9a2952da71aa0d01a5fd5cc28b57b346f49837
--- /dev/null
+++ b/configs/deimv2/deimv2_hgnetv2_femto_coco.yml
@@ -0,0 +1,128 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  '../base/dataloader.yml',
+  '../base/optimizer.yml',
+  '../base/deimv2.yml',
+]
+
+output_dir: ./outputs/deimv2_hgnetv2_femto_coco
+
+DEIM:
+  encoder: LiteEncoder
+
+HGNetv2:
+  name: 'Femto'
+  return_idx: [2]
+  freeze_at: -1
+  freeze_norm: False
+  use_lab: True
+
+LiteEncoder:
+  in_channels: [512]
+  feat_strides: [16]
+
+  # intra
+  hidden_dim: 96
+
+  # cross
+  expansion: 0.34
+  depth_mult: 0.5
+  act: 'silu'
+
+
+DEIMTransformer:
+  feat_channels: [96, 96]
+  feat_strides: [16, 32]
+  hidden_dim: 96
+  num_levels: 2
+  num_points: [4, 2]
+
+  num_layers: 3
+  eval_idx: -1
+  num_queries: 150
+
+  # FFN
+  dim_feedforward: 256
+
+  # New options for DEIMv2
+  share_bbox_head: True
+  use_gateway: False
+
+# Increase to search for the optimal ema
+epoches: 500 # 468 + 32
+
+## Our LR-Scheduler
+warmup_iter: 4000
+flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2
+no_aug_epoch: 32
+lr_gamma: 0.5
+
+optimizer:
+  type: AdamW
+  params:
+    -
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.0008
+    -
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.0008
+      weight_decay: 0.
+    - # not opt
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+ + lr: 0.0016 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +eval_spatial_size: [416, 416] +train_dataloader: + total_batch_size: 128 + dataset: + transforms: + ops: + - {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], + probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 10} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [416, 416], } + - {type: SanitizeBoundingBoxes, min_size: 10} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + epoch: [4, 250, 400] # list + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + mosaic_prob: 0.5 + + collate_fn: + mixup_prob: 0.0 + mixup_epochs: [40000, 15000] + copyblend_prob: 0.0 + copyblend_epochs: [40000, 15000] + + stop_epoch: 468 # 468 + 32 + ema_restart_decay: 0.9999 + base_size: 416 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 256 + dataset: + transforms: + ops: + - {type: Resize, size: [416, 416], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + num_workers: 16 + + +DEIMCriterion: + losses: ['mal', 'boxes'] # , 'local' + use_uni_set: False + + matcher: + matcher_change_epoch: 450 # FIX This \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_l_coco.yml b/configs/deimv2/deimv2_hgnetv2_l_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..0d94babe4b29ed306c1e1f5b1ab5352379242799 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_l_coco.yml @@ -0,0 +1,24 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_l_coco + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000025 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0005 + betas: [0.9, 0.999] + weight_decay: 0.000125 diff --git a/configs/deimv2/deimv2_hgnetv2_m_coco.yml b/configs/deimv2/deimv2_hgnetv2_m_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..d95fcf3bb45c04ab8ca1cd199a7ce47b2a95f683 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_m_coco.yml @@ -0,0 +1,72 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_m_coco + +HGNetv2: + name: 'B2' + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [384, 768, 1536] + hidden_dim: 256 + depth_mult: 0.67 + +DEIMTransformer: + num_layers: 4 # 5 6 + eval_idx: -1 # -2 -3 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.00004 + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
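+ # Backbone weights (the '^(?=.*backbone)...' group above) train at one tenth of the global lr below (0.00004 vs 0.0004).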
+ + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 102 # 90 + 12 + +## Our LR-Scheduler +flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 49, 90] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: 6 + mixup_epochs: [4, 49] + stop_epoch: 90 + copyblend_prob: 0.5 + copyblend_epochs: [4, 90] + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 80 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_n_coco.yml b/configs/deimv2/deimv2_hgnetv2_n_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..00ceea488b1eeec50204fb3464c4c725fafcc9fe --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_n_coco.yml @@ -0,0 +1,96 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_n_coco + +HGNetv2: + name: 'B0' + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + + version: 'dfine' + +DEIMTransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + num_levels: 2 + num_points: [6, 6] + + num_layers: 3 + eval_idx: -1 + + # FFN + dim_feedforward: 512 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0.
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: ~ + mixup_epochs: [4, 78] + stop_epoch: 148 + copyblend_prob: 0.4 + copyblend_epochs: [4, 78] # CP half + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 136 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_pico_coco.yml b/configs/deimv2/deimv2_hgnetv2_pico_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..71c29f65afd4db99b57d291de5789ad7d7b63240 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_pico_coco.yml @@ -0,0 +1,128 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml', +] + +output_dir: ./outputs/deimv2_hgnetv2_pico_coco + +DEIM: + encoder: LiteEncoder + decoder: DEIMTransformer + +HGNetv2: + name: 'Pico' + return_idx: [2] + freeze_at: -1 + freeze_norm: False + use_lab: True + +LiteEncoder: + in_channels: [512] + feat_strides: [16] + + # intra + hidden_dim: 112 + + # cross + expansion: 0.34 + depth_mult: 0.5 + act: 'silu' + + +DEIMTransformer: + feat_channels: [112, 112] + feat_strides: [16, 32] + hidden_dim: 112 + num_levels: 2 + num_points: [4, 2] + + num_layers: 3 + eval_idx: -1 + num_queries: 200 + + # FFN + dim_feedforward: 320 + + # New options for DEIMv2 + share_bbox_head: True + use_gateway: False + +# Increase to search for the optimal ema +epoches: 500 # 468 + 32 + +## Our LR-Scheduler +warmup_iter: 4000 +flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 32 +lr_gamma: 0.5 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0008 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0008 + weight_decay: 0. + - # not opt + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0016 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +eval_spatial_size: [640, 640] +train_dataloader: + total_batch_size: 128 + dataset: + transforms: + ops: + - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], + probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 8} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 8} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + epoch: [4, 250, 400] # list + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + mosaic_prob: 0.5 + + collate_fn: + mixup_prob: 0.0 + mixup_epochs: [40000, 15000] + copyblend_prob: 0.0 + copyblend_epochs: [40000, 15000] + stop_epoch: 468 # 468 + 32 + ema_restart_decay: 0.9999 + base_size: 640 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 256 + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + num_workers: 16 + + +DEIMCriterion: + losses: ['mal', 'boxes'] # , 'local' + use_uni_set: False + + matcher: + matcher_change_epoch: 450 # FIX This \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_s_coco.yml b/configs/deimv2/deimv2_hgnetv2_s_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..b543760f9f759823f81a7abbb6f93858e584aa87 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_s_coco.yml @@ -0,0 +1,76 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_s_coco + +HGNetv2: + name: 'B0' + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [256, 512, 1024] + hidden_dim: 256 + depth_mult: 0.34 + expansion: 0.5 + + version: 'dfine' + +DEIMTransformer: + num_layers: 3 # 4 5 6 + eval_idx: -1 # -2 -3 -4 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.0002 + - + params: '^(?=.*(?:norm|bn)).*$' # except bias + weight_decay: 0. 
+ + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 132 # 120 + 4n + +## Our LR-Scheduler +flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 64, 120] # list + + collate_fn: + ema_restart_decay: 0.9999 + base_size_repeat: 20 + mixup_epochs: [4, 64] + stop_epoch: 120 + copyblend_prob: 0.5 + # copyblend_epochs: [4, 64] # from v11 to v12: copy-paste continues only half epochs + copyblend_epochs: [4, 120] + area_threshold: 100 + num_objects: 3 + with_expand: True + expand_ratios: [0.1, 0.25] + +DEIMCriterion: + matcher: + # new matcher + change_matcher: True + iou_order_alpha: 4.0 + matcher_change_epoch: 100 \ No newline at end of file diff --git a/configs/deimv2/deimv2_hgnetv2_x_coco.yml b/configs/deimv2/deimv2_hgnetv2_x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..0355d6e314a4ef127b1c3bb25d2978a8cbedb4a5 --- /dev/null +++ b/configs/deimv2/deimv2_hgnetv2_x_coco.yml @@ -0,0 +1,60 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + '../base/dataloader.yml', + '../base/optimizer.yml', + '../base/deimv2.yml' +] + +output_dir: ./outputs/deimv2_hgnetv2_x_coco + + +HGNetv2: + name: 'B5' + return_idx: [1, 2, 3] + freeze_stem_only: True + freeze_at: 0 + freeze_norm: True + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + +DEIMTransformer: + feat_channels: [384, 384, 384] # [256, 256, 256] + reg_scale: 8 # 4 + + # FFN + dim_feedforward: 2048 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000005 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
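+ # With freeze_at: 0 and freeze_norm: True above, the remaining B5 backbone weights + # are fine-tuned at 1/100 of the global lr below (0.000005 vs 0.0005).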
+ + lr: 0.0005 + betas: [0.9, 0.999] + weight_decay: 0.000125 + +# Increase to search for the optimal ema +epoches: 58 # 50 + 8 + +## Our LR-Scheduler +flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 8 + +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 29, 50] # list + + collate_fn: + ema_restart_decay: 0.9998 + base_size_repeat: 3 diff --git a/configs/deimv2_floorplan.yaml b/configs/deimv2_floorplan.yaml deleted file mode 100644 index 458545930512c119201f0ed7924b4009783e9383..0000000000000000000000000000000000000000 --- a/configs/deimv2_floorplan.yaml +++ /dev/null @@ -1,189 +0,0 @@ -__include__: [ - 'coco_detection.yml', # in the same directory - # '../configs/runtime.yml', # commented out because the file may not exist - # '../configs/base/dataloader.yml', # commented out because the file may not exist - # '../configs/base/optimizer.yml', - # '../configs/base/deimv2.yml', # commented out because the file may not exist -] - -output_dir: ./outputs/deimv2_floorplan - -# Model definition (referenced by engine/core.py) -model: - type: DEIM - backbone: - type: DINOv3STAs - name: vit_tiny - weights_path: ./ckpts/vitt_distill.pt - interaction_indexes: [3, 7, 11] - num_heads: 3 - embed_dim: 192 - encoder: - type: HybridEncoder - in_channels: [192, 192, 192] - depth_mult: 0.67 - expansion: 0.34 - hidden_dim: 192 - dim_feedforward: 512 - decoder: - type: DEIMTransformer - feat_channels: [192, 192, 192] - hidden_dim: 192 - dim_feedforward: 512 - num_layers: 4 # 4 5 6 - eval_idx: -1 # -2 -3 -4 - -# Postprocessor definition (referenced by engine/core.py) -postprocessor: - type: PostProcessor - -# Kept for compatibility (use as needed) -DEIM: - backbone: DINOv3STAs - -Model: - num_classes: 16 - class_names: ["kanki", "kanki_shikaku", "kanki_regisuta", "window1", "window2", "door1", "door2", "bathtub1", "konro1", "sink1", "toilet1", "kasaikeihou1", "kasaikeihou2", "houi1", "houi2", "houi3"] - -# Set eval_spatial_size explicitly (image size at inference) -eval_spatial_size: [640, 640] - -DINOv3STAs: - name: vit_tiny - embed_dim: 192 - weights_path: ./ckpts/vitt_distill.pt # delete this line to train without pretrained weights - interaction_indexes: [3, 7, 11] - num_heads: 3 - -HybridEncoder: - in_channels: [192, 192, 192] - depth_mult: 0.67 - expansion: 0.34 - hidden_dim: 192 - dim_feedforward: 512 - -DEIMTransformer: - feat_channels: [192, 192, 192] - hidden_dim: 192 - dim_feedforward: 512 - num_layers: 4 # 4 5 6 - eval_idx: -1 # -2 -3 -4 - - -## Optimizer -optimizer: - type: AdamW - - params: - - - # except norm/bn/bias in self.dinov3 - params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' - lr: 0.000025 - - - # including all norm/bn/bias in self.dinov3 - params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' - lr: 0.000025 - weight_decay: 0. - - - # including all norm/bn/bias except for the self.dinov3 - params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' - weight_decay: 0.
- - lr: 0.0005 - betas: [0.9, 0.999] - weight_decay: 0.0001 - -epoches: 400 -flat_epoch: 196 -no_aug_epoch: 46 - -# Settings needed from optimizer.yml, added manually -use_amp: True -use_ema: True -ema: - type: ModelEMA - decay: 0.9999 - warmups: 1000 - start: 0 - -clip_max_norm: 0.1 -sync_bn: True -find_unused_parameters: True - -# Learning-rate scheduling settings -# CosineAnnealingLR-specific settings (parameters kept minimal) -lr_scheduler: - type: CosineAnnealingLR - T_max: 400 - eta_min: 0.0000001 - -lr_warmup_scheduler: - type: LinearWarmup - warmup_duration: 1000 - -# Disable the existing flatcosine scheduler -lrsheduler: null - -# Also disable the flatcosine scheduler from deimv2.yml -lr_gamma: null -warmup_iter: null -flat_epoch: null -no_aug_epoch: null - - -# ---- Data Aug / Loader (floor plans + 640px + OOM mitigation) ---- -train_dataloader: - dataset: - transforms: - ops: - # Mosaic uses a low probability and a narrow scale range to suppress memory peaks at 640 - - {type: Mosaic, output_size: 640, rotation_range: 8, translation_range: [0.1, 0.1], - scaling_range: [0.9, 1.1], probability: 0.2, fill_value: 0, use_cache: True, - max_cached_images: 20, random_pop: True} - - {type: RandomPhotometricDistort, p: 0.2} - - {type: RandomZoomOut, fill: 0} - - {type: RandomIoUCrop, p: 0.6} - - {type: SanitizeBoundingBoxes, min_size: 1} - - {type: RandomHorizontalFlip} - - {type: RandomRotation, degrees: [90, 180, 270, 360], p: 0.5} # enabled in the fixed version - - {type: Resize, size: [640, 640]} # ★ fixed at 640 - - {type: SanitizeBoundingBoxes, min_size: 1} - - {type: ConvertPILImage, dtype: 'float32', scale: True} - - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} - - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} - policy: - epoch: [8, 192, 352] # adjusted to match 400 epochs - - collate_fn: # prevents degradation on line drawings & limits memory - ema_restart_decay: 0.9999 - base_size_repeat: 1 # ★ set to 1 to effectively turn multi-scale off - stop_epoch: 352 # stop at about 90% of 400 epochs - copyblend_epochs: [8, 352] # adjusted to match 400 epochs - - # effective only if the implementation reads it; otherwise control via base/dataloader.yml or launch arguments - total_batch_size: 4 # ★ dropped to 4 first for stability - -val_dataloader: - dataset: - transforms: - ops: - - {type: Resize, size: [640, 640]} - - {type: ConvertPILImage, dtype: 'float32', scale: True} - - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} - total_batch_size: 6 # keep evaluation batch size comparable - -DEIMCriterion: - matcher: - change_matcher: True - iou_order_alpha: 4.0 - matcher_change_epoch: 300 - gamma: 1.5 - alpha: 0.75 - weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} - losses: [mal, boxes, local] - -# Output settings - always save the last epoch -output: - save_last: true - save_interval: 5 # checkpoint save interval - checkpoint_freq: 5 # save frequency in the training loop diff --git a/configs/runtime.yml b/configs/runtime.yml new file mode 100644 index 0000000000000000000000000000000000000000..8397ce1ff91246825e39f0530b544daaa0f891fe --- /dev/null +++ b/configs/runtime.yml @@ -0,0 +1,20 @@ +print_freq: 500 +output_dir: './logs' +checkpoint_freq: 12 + + +sync_bn: True +find_unused_parameters: True + + +use_amp: False +scaler: + type: GradScaler + enabled: True + + +use_ema: False +ema: + type: ModelEMA + decay: 0.9999 + warmups: 1000 diff --git a/engine/__init__.py b/engine/__init__.py index 7278009b9c96970c981f631bbd1702f9328d159f..69baa01f55ae4799118a52fb6290ae7a2006d87a 100644 --- a/engine/__init__.py +++ b/engine/__init__.py @@ -1,13 +1,16 @@ -# engine package -# Import modules to register them in the registry -from . import backbone -from . import deim -from . import data -from . import optim -from . import misc +""" +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" -# Export YAMLConfig -from .core.yaml_config import YAMLConfig +# for register purpose +from . import optim +from . 
import data +from . import deim -__all__ = ['YAMLConfig'] +from .backbone import * +from .backbone import ( + get_activation, + FrozenBatchNorm2d, + freeze_batch_norm2d, +) \ No newline at end of file diff --git a/engine/backbone/vit_tiny.py b/engine/backbone/vit_tiny.py index 50aa2f7b49be54f041c58582eaf99812d7af4023..e00291394d91466f41e9829c1a8b2c5f32e1e862 100644 --- a/engine/backbone/vit_tiny.py +++ b/engine/backbone/vit_tiny.py @@ -6,14 +6,16 @@ Modified from DINOv3 (https://github.com/facebookresearch/dinov3) Modified from https://huggingface.co/spaces/Hila/RobustViT/blob/main/ViT/ViT_new.py """ +import math +import warnings +from functools import partial +from typing import List, Literal, Tuple + +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from functools import partial -import math -import numpy as np -import warnings -from typing import Literal, Tuple +from torch import nn class RopePositionEmbedding(nn.Module): @@ -180,11 +182,11 @@ class Attention(nn.Module): head_dim = dim // num_heads self.scale = head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) + self.attn_drop = attn_drop self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) - def forward(self, x, rope_sincos=None, register_hook=False): + def forward(self, x, rope_sincos=None): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv.unbind(0) @@ -200,13 +202,8 @@ class Attention(nn.Module): q = torch.cat((q_cls, q_patch), dim=2) k = torch.cat((k_cls, k_patch), dim=2) - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - if register_hook: attn.register_hook(self.save_attn_gradients) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop) + x = x.transpose(1, 2).reshape([B, N, C]) x = self.proj(x) x = self.proj_drop(x) return x @@ -220,8 +217,8 @@ class Block(nn.Module): self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) - def forward(self, x, rope_sincos=None, register_hook=False): - attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos, register_hook=register_hook) + def forward(self, x, rope_sincos=None): + attn_output = self.attn(self.norm1(x), rope_sincos=rope_sincos) x = x + self.drop_path(attn_output) x = x + self.drop_path(self.mlp(self.norm2(x))) return x @@ -260,7 +257,6 @@ class VisionTransformer(nn.Module): normalize_coords="separate", shift_coords=None, jitter_coords=None, rescale_coords=None, dtype=None, device=None, ) - self.init_weights() def init_weights(self): @@ -286,28 +282,7 @@ class VisionTransformer(nn.Module): def feature_dim(self): return self.embed_dim - def forward_features(self, x, register_hook=False): - B, C, H, W = x.shape - - x_embed = self._model.patch_embed(x) - cls_token = self._model.cls_token.expand(x_embed.shape[0], -1, -1) - x = torch.cat((cls_token, x_embed), dim=1) - - patch_grid_h = H // self.patch_size - patch_grid_w = W // self.patch_size - rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w) - - for blk in self._model.blocks: - x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook) - x = x[:, 1:, :] - return {'features': x.transpose(1, 2).reshape(-1, self.embed_dim, patch_grid_h, patch_grid_w)} - - def forward_pool(self, x): - features = 
self.forward_features(x)['features'] - pooled_features = features.mean(dim=[2, 3]) - return {'pooled_features': pooled_features} - - def forward(self, x, register_hook=False): + def forward(self, x): outs = [] B, C, H, W = x.shape @@ -320,7 +295,7 @@ class VisionTransformer(nn.Module): rope_sincos = self._model.rope_embed(H=patch_grid_h, W=patch_grid_w) for i, blk in enumerate(self._model.blocks): - x = blk(x, rope_sincos=rope_sincos, register_hook=register_hook) + x = blk(x, rope_sincos=rope_sincos) if i in self.return_layers: outs.append((x[:, 1:], x[:, 0])) return outs diff --git a/engine/core/workspace.py b/engine/core/workspace.py index 1d7bd693d6d89377ed722c8fdebd1a4b94c2f789..2f9d2a146605e7b8c58c92eedbd6f5eb0981372d 100644 --- a/engine/core/workspace.py +++ b/engine/core/workspace.py @@ -6,7 +6,6 @@ Copyright(c) 2023 lyuwenyu. All Rights Reserved. import inspect import importlib import functools -import copy from collections import defaultdict from typing import Any, Dict, Optional, List @@ -14,23 +13,6 @@ from typing import Any, Dict, Optional, List GLOBAL_CONFIG = defaultdict(dict) -def _safe_copy_cfg(obj): - """ - deepcopy that leaves module objects untouched to avoid pickle errors. - """ - if isinstance(obj, dict): - copied = {} - for k, v in obj.items(): - if k == '_pymodule' or inspect.ismodule(v): - copied[k] = v - else: - copied[k] = _safe_copy_cfg(v) - return copied - if isinstance(obj, list): - return [_safe_copy_cfg(v) for v in obj] - return copy.deepcopy(obj) - - def register(dct :Any=GLOBAL_CONFIG, name=None, force=False): """ dct: @@ -110,63 +92,18 @@ def extract_schema(module: type): def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): """ - Create registered modules from string, type, or config dict. """ - cfg_override = None - if isinstance(type_or_name, dict): - assert 'type' in type_or_name, 'config dict must have `type` key.' - cfg_override = copy.deepcopy(type_or_name) - name = cfg_override.pop('type') - if isinstance(name, type): - name = name.__name__ - elif isinstance(type_or_name, (type, str)): - name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__ - else: - raise AssertionError('create should be modules, name, or config dict.') - - # Check if module is registered in GLOBAL_CONFIG first - if name not in GLOBAL_CONFIG: - raise ValueError( - f'The module {name} is not registered in GLOBAL_CONFIG. ' - f'Make sure the module is imported and registered with @register() decorator. ' - f'Available registered modules: {list(GLOBAL_CONFIG.keys())[:20]}...' - ) - + assert type(type_or_name) in (type, str), 'create should be modules or name.' 
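+ # Resolve the registry key: strings are used as-is, classes by their __name__.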
+ + name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__ + if name in global_cfg: if hasattr(global_cfg[name], '__dict__'): return global_cfg[name] - - # Get config from global_cfg if available, otherwise use GLOBAL_CONFIG - if name in global_cfg: - cfg = _safe_copy_cfg(global_cfg[name]) else: - cfg = _safe_copy_cfg(GLOBAL_CONFIG[name]) - # fallback: if merged config lost registry metadata, restore from base GLOBAL_CONFIG - # Always check and restore metadata from GLOBAL_CONFIG if missing, regardless of global_cfg - if name in GLOBAL_CONFIG: - base = _safe_copy_cfg(GLOBAL_CONFIG[name]) - # Restore all metadata fields if they're missing - if '_pymodule' not in cfg: - cfg['_pymodule'] = base.get('_pymodule') - if '_kwargs' not in cfg: - cfg['_kwargs'] = base.get('_kwargs', {}) - if '_inject' not in cfg: - cfg['_inject'] = base.get('_inject', []) - if '_share' not in cfg: - cfg['_share'] = base.get('_share', []) - if '_name' not in cfg: - cfg['_name'] = base.get('_name', name) - - # merge user overrides into registered schema - if cfg_override is not None: - if isinstance(cfg, dict): - _keys = [k for k in list(cfg.keys()) if not k.startswith('_')] - for _arg in _keys: - del cfg[_arg] - cfg.update(cfg.get('_kwargs', {})) - cfg.update(cfg_override) - cfg.update(kwargs) - kwargs = {} + raise ValueError('The module {} is not registered'.format(name)) + + cfg = global_cfg[name] if isinstance(cfg, dict) and 'type' in cfg: _cfg: dict = global_cfg[cfg['type']] @@ -174,41 +111,18 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): _keys = [k for k in _cfg.keys() if not k.startswith('_')] for _arg in _keys: del _cfg[_arg] - _cfg.update(_cfg.get('_kwargs', {})) # restore default args + _cfg.update(_cfg['_kwargs']) # restore default args _cfg.update(cfg) # load config args _cfg.update(kwargs) name = _cfg.pop('type') # pop extra key `type` (from cfg) return create(name, global_cfg) - # Safety check: ensure _pymodule exists before accessing it - if '_pymodule' not in cfg: - if name in GLOBAL_CONFIG: - base = _safe_copy_cfg(GLOBAL_CONFIG[name]) - cfg['_pymodule'] = base.get('_pymodule') - cfg.setdefault('_kwargs', base.get('_kwargs', {})) - cfg.setdefault('_inject', base.get('_inject', [])) - cfg.setdefault('_share', base.get('_share', [])) - else: - raise ValueError( - f'The module {name} is not properly registered. ' - f'Missing _pymodule metadata. Make sure the module is imported and registered with @register() decorator.' - ) - - if cfg['_pymodule'] is None: - raise ValueError( - f'The module {name} has None _pymodule. ' - f'This indicates a registration issue. Make sure the module is properly imported.' 
- ) - module = getattr(cfg['_pymodule'], name) module_kwargs = {} module_kwargs.update(cfg) # shared var - # Safety check: ensure _share exists - if '_share' not in cfg: - cfg['_share'] = [] for k in cfg['_share']: if k in global_cfg: module_kwargs[k] = global_cfg[k] @@ -216,9 +130,6 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): module_kwargs[k] = cfg[k] # inject - # Safety check: ensure _inject exists - if '_inject' not in cfg: - cfg['_inject'] = [] for k in cfg['_inject']: _k = cfg[k] @@ -244,12 +155,12 @@ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): if _type not in global_cfg: raise ValueError(f'Missing {_type} in inspect stage.') - _cfg: dict = _safe_copy_cfg(global_cfg[_type]) + _cfg: dict = global_cfg[_type] # clean args _keys = [k for k in _cfg.keys() if not k.startswith('_')] for _arg in _keys: del _cfg[_arg] - _cfg.update(_cfg.get('_kwargs', {})) # restore default values + _cfg.update(_cfg['_kwargs']) # restore default values _cfg.update(_k) # load config args name = _cfg.pop('type') # pop extra key (`type` from _k) module_kwargs[k] = create(name, global_cfg) diff --git a/engine/core/yaml_utils.py b/engine/core/yaml_utils.py index 214f3d938aa46c447379247fc293a511992cb21d..411e416d4f2ceb3e54c130860be7f765ed0e28af 100644 --- a/engine/core/yaml_utils.py +++ b/engine/core/yaml_utils.py @@ -113,17 +113,7 @@ def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: dct[k] = another[k] elif isinstance(dct[k], dict) and isinstance(another[k], dict): - # Prefer the registered schema (the one holding _pymodule etc.) - # If `another` is a registered schema, keep its metadata - if '_pymodule' in another[k] and '_pymodule' not in dct[k]: - # Keep the registered schema's metadata - base_meta = {key: val for key, val in another[k].items() if key.startswith('_')} - # Overwrite with parameters from the YAML config - _merge(dct[k], another[k]) - # Restore the metadata - dct[k].update(base_meta) - else: - _merge(dct[k], another[k]) + _merge(dct[k], another[k]) elif overwrite: dct[k] = another[k] diff --git a/engine/data/dataset/coco_dataset.py b/engine/data/dataset/coco_dataset.py index 7faaf1288e5ec4c8f8bca8714968261b93de93fb..83202e74019f07d259ba00989744684d91498e7d 100644 --- a/engine/data/dataset/coco_dataset.py +++ b/engine/data/dataset/coco_dataset.py @@ -11,27 +11,14 @@ import torch.utils.data import torchvision from PIL import Image -try: - import faster_coco_eval - import faster_coco_eval.core.mask as coco_mask - _faster_coco_eval_available = True -except ImportError: - _faster_coco_eval_available = False - # Not needed at inference time, so create a dummy object - class DummyFasterCocoEval: - @staticmethod - def init_as_pycocotools(): - pass - faster_coco_eval = DummyFasterCocoEval() - coco_mask = None - +import faster_coco_eval +import faster_coco_eval.core.mask as coco_mask from ._dataset import DetDataset from .._misc import convert_to_tv_tensor from ...core import register torchvision.disable_beta_transforms_warning() -if _faster_coco_eval_available: - faster_coco_eval.init_as_pycocotools() +faster_coco_eval.init_as_pycocotools() Image.MAX_IMAGE_PIXELS = None __all__ = ['CocoDetection'] diff --git a/engine/data/dataset/coco_eval.py b/engine/data/dataset/coco_eval.py index 937807f4b111f8609a3e838bd8e7809775295659..75f6bd8ddde3fa164fbc87f499f3f3a2919f4ee5 100644 --- a/engine/data/dataset/coco_eval.py +++ b/engine/data/dataset/coco_eval.py @@ -11,16 +11,8 @@ import copy import numpy as np import torch -try: - from faster_coco_eval import COCO, COCOeval_faster - import faster_coco_eval.core.mask as mask_util - _faster_coco_eval_available = True -except ImportError: - 
_faster_coco_eval_available = False - COCO = None - COCOeval_faster = None - mask_util = None - +from faster_coco_eval import COCO, COCOeval_faster +import faster_coco_eval.core.mask as mask_util from ...core import register from ...misc import dist_utils __all__ = ['CocoEvaluator',] diff --git a/engine/data/dataset/coco_utils.py b/engine/data/dataset/coco_utils.py index 1141581b21e27bfba0194c56636b4eb993f05847..6b81b5ea9524618fbbae575b89d42e780dc5a050 100644 --- a/engine/data/dataset/coco_utils.py +++ b/engine/data/dataset/coco_utils.py @@ -9,14 +9,8 @@ import torch import torch.utils.data import torchvision import torchvision.transforms.functional as TVF -try: - import faster_coco_eval.core.mask as coco_mask - from faster_coco_eval import COCO - _faster_coco_eval_available = True -except ImportError: - _faster_coco_eval_available = False - coco_mask = None - COCO = None +import faster_coco_eval.core.mask as coco_mask +from faster_coco_eval import COCO def convert_coco_poly_to_mask(segmentations, height, width): diff --git a/engine/data/transforms/_transforms.py b/engine/data/transforms/_transforms.py index a4442863ce25eed604163117f5ecf0f6e34ceb11..31588df5203041730da89b7231479b5b4fc92f20 100644 --- a/engine/data/transforms/_transforms.py +++ b/engine/data/transforms/_transforms.py @@ -114,55 +114,6 @@ class ConvertBoxes(T.Transform): return inpt -@register() -class RandomRotation(T.Transform): - _transformed_types = ( - PIL.Image.Image, - Image, - Video, - Mask, - BoundingBoxes, - ) - - def __init__(self, degrees, p=1.0) -> None: - super().__init__() - if isinstance(degrees, (int, float)): - degrees = [degrees] - self.degrees = degrees - self.p = p - - def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - if torch.rand(1) >= self.p: - return {"angle": 0} - - angle = torch.tensor(self.degrees)[torch.randint(0, len(self.degrees), (1,))].item() - return {"angle": angle} - - def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - angle = params["angle"] - if angle == 0: - return inpt - - # Add a type check and only handle supported types - if hasattr(inpt, '__class__') and inpt.__class__.__name__ in ['Image', 'BoundingBoxes', 'Mask', 'Video']: - return F.rotate(inpt, angle=angle, fill=0) - else: - # Return unsupported types unchanged - return inpt - - def __call__(self, *inputs: Any) -> Any: - if len(inputs) == 1: - return self._transform(inputs[0], self._get_params([inputs[0]])) - else: - params = self._get_params(inputs) - # Apply the transform to each input individually and collect the results in a list - results = [] - for inpt in inputs: - result = self._transform(inpt, params) - results.append(result) - return tuple(results) - - @register() class ConvertPILImage(T.Transform): _transformed_types = ( diff --git a/engine/deim/hybrid_encoder.py b/engine/deim/hybrid_encoder.py index 2752157c52f21c7eb8842d621dd9a803b637ac6a..77a74725bc966f130837ff7edbaa7a0730497ff6 100644 --- a/engine/deim/hybrid_encoder.py +++ b/engine/deim/hybrid_encoder.py @@ -199,9 +199,10 @@ class RepNCSPELAN4(nn.Module): super().__init__() self.c = c3//2 self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act) - CSPLayer_cls = CSPLayer2 if csp_type == 'csp2' else CSPLayer - self.cv2 = nn.Sequential(CSPLayer_cls(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) - self.cv3 = nn.Sequential(CSPLayer_cls(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + if csp_type == 'csp2': + CSPLayer = CSPLayer2 + self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, 
bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act) def forward_chunk(self, x): diff --git a/tools/benchmark/dataset.py b/tools/benchmark/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..76fa6491bac0719b0ad0bffe844d51a900038227 --- /dev/null +++ b/tools/benchmark/dataset.py @@ -0,0 +1,105 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import os +import glob +from PIL import Image + +import torch +import torch.utils.data as data +import torchvision +import torchvision.transforms as T +import torchvision.transforms.functional as F + +Image.MAX_IMAGE_PIXELS = None + +class ToTensor(T.ToTensor): + def __init__(self) -> None: + super().__init__() + + def __call__(self, pic): + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + +class PadToSize(T.Pad): + def __init__(self, size, fill=0, padding_mode='constant'): + super().__init__(0, fill, padding_mode) + self.size = size + self.fill = fill + + def __call__(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be padded. + + Returns: + PIL Image or Tensor: Padded image. + """ + w, h = F.get_image_size(img) + padding = (0, 0, self.size[0] - w, self.size[1] - h) + return F.pad(img, padding, self.fill, self.padding_mode) + + +class Dataset(data.Dataset): + def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: + super().__init__() + + self.device = device + self.size = 640 + + self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) + + if preprocess is None: + self.preprocess = T.Compose([ + T.Resize(size=639, max_size=640), + PadToSize(size=(640, 640), fill=114), + ToTensor(), + T.ConvertImageDtype(torch.float), + ]) + else: + self.preprocess = preprocess + + def __len__(self, ): + return len(self.im_path_list) + + def __getitem__(self, index): + # im = Image.open(self.img_path_list[index]).convert('RGB') + im = torchvision.io.read_file(self.im_path_list[index]) + im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) + _, h, w = im.shape # c,h,w + + im = self.preprocess(im) + + blob = { + 'images': im, + 'im_shape': torch.tensor([self.size, self.size]).to(im.device), + 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), + 'orig_target_sizes': torch.tensor([w, h]).to(im.device), + } + + return blob + + @staticmethod + def post_process(): + pass + + @staticmethod + def collate_fn(): + pass + + +def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): + '''show result + Keys: + 'num_dets', 'det_boxes', 'det_scores', 'det_classes' + ''' + for i in range(blob['image'].shape[0]): + det_scores = outputs['det_scores'][i] + det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] + + im = (blob['image'][i] * 255).to(torch.uint8) + im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) + Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') diff --git a/tools/benchmark/get_info.py b/tools/benchmark/get_info.py new file mode 100644 index 0000000000000000000000000000000000000000..b72efa35b599f2bd8d4b8440e505ba5c6ec8f2ca --- /dev/null +++ b/tools/benchmark/get_info.py @@ -0,0 
+1,50 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +""" + +import os +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) + +import argparse +from calflops import calculate_flops +from engine.core import YAMLConfig + +import torch +import torch.nn as nn + +def custom_repr(self): + return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' +original_repr = torch.Tensor.__repr__ +torch.Tensor.__repr__ = custom_repr + +def main(args, ): + """main + """ + cfg = YAMLConfig(args.config, resume=None) + class Model_for_flops(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + + def forward(self, images): + outputs = self.model(images) + return outputs + + model = Model_for_flops().eval() + + flops, macs, _ = calculate_flops(model=model, + input_shape=(1, 3, 640, 640), + output_as_string=True, + output_precision=4) + params = sum(p.numel() for p in model.parameters()) + print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', default= "configs/dfine/dfine_hgnetv2_l_coco.yml", type=str) + args = parser.parse_args() + + main(args) diff --git a/tools/benchmark/requirements.txt b/tools/benchmark/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a3c0f0a63f0ad81e1d41a19753d7c550231c27 --- /dev/null +++ b/tools/benchmark/requirements.txt @@ -0,0 +1,6 @@ +onnxruntime +tensorrt +pycuda +calflops +tqdm +# onnx_graphsurgeon # for YOLOs diff --git a/tools/benchmark/trt_benchmark.py b/tools/benchmark/trt_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..a650ac06f62b74034ee39a9ae66edb046486ad7c --- /dev/null +++ b/tools/benchmark/trt_benchmark.py @@ -0,0 +1,207 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import tensorrt as trt +import pycuda.driver as cuda +from utils import TimeProfiler +import numpy as np +import os +import time +import torch + +from collections import namedtuple, OrderedDict +import glob +import argparse +from dataset import Dataset +from tqdm import tqdm + + +def parse_args(): + parser = argparse.ArgumentParser(description='Argument Parser Example') + parser.add_argument('--COCO_dir', + type=str, + default='/data/COCO2017/val2017', + help="Directory for images to perform inference on.") + parser.add_argument("--engine_dir", + type=str, + help="Directory containing model engine files.") + parser.add_argument('--busy', + action='store_true', + help="Flag to indicate that other processes may be running.") + args = parser.parse_args() + return args + +class TRTInference(object): + def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False): + self.engine_path = engine_path + self.device = device + self.backend = backend + self.max_batch_size = max_batch_size + + self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) + self.engine = self.load_engine(engine_path) + self.context = self.engine.create_execution_context() + self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) + self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) + self.input_names = self.get_input_names() + self.output_names = self.get_output_names() + + if self.backend == 'cuda': + self.stream = cuda.Stream() + self.time_profile = TimeProfiler() + self.time_profile_dataset = TimeProfiler() + + def init(self): + self.dynamic = False + + def load_engine(self, path): + trt.init_libnvinfer_plugins(self.logger, '') + with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def get_input_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + names.append(name) + return names + + def get_output_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: + names.append(name) + return names + + def get_bindings(self, engine, context, max_batch_size=32, device=None): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + bindings = OrderedDict() + for i, name in enumerate(engine): + shape = engine.get_tensor_shape(name) + dtype = trt.nptype(engine.get_tensor_dtype(name)) + + if shape[0] == -1: + dynamic = True + shape[0] = max_batch_size + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + context.set_input_shape(name, shape) + + if self.backend == 'cuda': + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + data = np.random.randn(*shape).astype(dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + else: + data = cuda.pagelocked_empty(trt.volume(shape), dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + else: + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) + return bindings + + def run_torch(self, blob): + for n in self.input_names: + if self.bindings[n].shape != blob[n].shape: + self.context.set_input_shape(n, blob[n].shape) + self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) + + self.bindings_addr.update({n: 
blob[n].data_ptr() for n in self.input_names}) + self.context.execute_v2(list(self.bindings_addr.values())) + outputs = {n: self.bindings[n].data for n in self.output_names} + return outputs + + def async_run_cuda(self, blob): + for n in self.input_names: + cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) + + bindings_addr = [int(v) for _, v in self.bindings_addr.items()] + self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) + + outputs = {} + for n in self.output_names: + cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) + outputs[n] = self.bindings[n].data + + self.stream.synchronize() + + return outputs + + def __call__(self, blob): + if self.backend == 'torch': + return self.run_torch(blob) + elif self.backend == 'cuda': + return self.async_run_cuda(blob) + + def synchronize(self): + if self.backend == 'torch' and torch.cuda.is_available(): + torch.cuda.synchronize() + elif self.backend == 'cuda': + self.stream.synchronize() + + def warmup(self, blob, n): + for _ in range(n): + _ = self(blob) + + def speed(self, blob, n, nonempty_process=False): + times = [] + self.time_profile_dataset.reset() + for i in tqdm(range(n), desc="Running Inference", unit="iteration"): + self.time_profile.reset() + with self.time_profile_dataset: + img = blob[i] + if img['images'] is not None: + img['image'] = img['input'] = img['images'].unsqueeze(0) + else: + img['images'] = img['input'] = img['image'].unsqueeze(0) + with self.time_profile: + _ = self(img) + times.append(self.time_profile.total) + + # end-to-end model only + times = sorted(times) + if len(times) > 100 and nonempty_process: + times = times[:100] + + avg_time = sum(times) / len(times) # Calculate the average of the remaining times + return avg_time + +def main(): + FLAGS = parse_args() + dataset = Dataset(FLAGS.COCO_dir) + im = torch.ones(1, 3, 640, 640).cuda() + blob = { + 'image': im, + 'images': im, + 'input': im, + 'im_shape': torch.tensor([640, 640]).to(im.device), + 'scale_factor': torch.tensor([1, 1]).to(im.device), + 'orig_target_sizes': torch.tensor([640, 640]).to(im.device), + } + + engine_files = glob.glob(os.path.join(FLAGS.engine_dir, "*.engine")) + results = [] + + for engine_file in engine_files: + print(f"Testing engine: {engine_file}") + model = TRTInference(engine_file, max_batch_size=1, verbose=False) + model.init() + model.warmup(blob, 1000) + t = [] + for _ in range(1): + t.append(model.speed(dataset, 1000, FLAGS.busy)) + avg_latency = 1000 * torch.tensor(t).mean() + results.append((engine_file, avg_latency)) + print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms") + + del model + torch.cuda.empty_cache() + time.sleep(1) + + sorted_results = sorted(results, key=lambda x: x[1]) + for engine_file, latency in sorted_results: + print(f"Engine: {engine_file}, Latency: {latency:.2f} ms") + +if __name__ == '__main__': + main() diff --git a/tools/benchmark/utils.py b/tools/benchmark/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..23e1800a2deaf84f3fec46c1a40b6e29f4772719 --- /dev/null +++ b/tools/benchmark/utils.py @@ -0,0 +1,80 @@ +import time +import contextlib +import numpy as np +from PIL import Image +from collections import OrderedDict + +import onnx +import torch +import onnx_graphsurgeon + + +def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): + '''--loadInputs='image:input_tensor.bin' + ''' + im = Image.open(path).resize(size) + data = np.asarray(im, 
dtype=np.float32).transpose(2, 0, 1)[None] / 255. + data.tofile(output_name) + + +def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): + ''' + http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html + https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py + ''' + onnx_model = onnx.load(path) + + if simplify: + from onnxsim import simplify + onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) + + graph = onnx_graphsurgeon.import_onnx(onnx_model) + graph.toposort() + graph.fold_constants() + graph.cleanup() + + topk = max_output_boxes + attrs = OrderedDict(plugin_version='1', + background_class=-1, + max_output_boxes=topk, + score_threshold=score_threshold, + iou_threshold=iou_threshold, + score_activation=False, + box_coding=0, ) + + outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), + onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), + onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), + onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] + + graph.layer(op='EfficientNMS_TRT', + name="batched_nms", + inputs=[graph.outputs[0], + graph.outputs[1]], + outputs=outputs, + attrs=attrs, ) + + graph.outputs = outputs + graph.cleanup().toposort() + + onnx.save(onnx_graphsurgeon.export_onnx(graph), 'yolo_w_nms.onnx') + + +class TimeProfiler(contextlib.ContextDecorator): + def __init__(self, ): + self.total = 0 + + def __enter__(self, ): + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): + self.total += self.time() - self.start + + def reset(self, ): + self.total = 0 + + def time(self, ): + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/tools/dataset/remap_obj365.py b/tools/dataset/remap_obj365.py new file mode 100644 index 0000000000000000000000000000000000000000..f76214e7a05b5c158deaecbe4f4994e020bf8226 --- /dev/null +++ b/tools/dataset/remap_obj365.py @@ -0,0 +1,139 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import json +import os +import argparse + + +def update_image_paths(images, new_prefix): + print('Updating image paths with new prefix...') + for img in images: + split = img['file_name'].split('/')[1:] + img['file_name'] = os.path.join(new_prefix, *split) + print('Image paths updated.') + return images + +def create_split_annotations(original_annotations, split_image_ids, new_prefix, output_file): + print(f'Creating split annotations for {output_file}...') + new_images = [img for img in original_annotations['images'] if img['id'] in split_image_ids] + print(f'Number of images selected: {len(new_images)}') + if new_prefix is not None: + new_images = update_image_paths(new_images, new_prefix) + + new_annotations = { + 'images': new_images, + 'annotations': [ann for ann in original_annotations['annotations'] if ann['image_id'] in split_image_ids], + 'categories': original_annotations['categories'] + } + print(f'Number of annotations selected: {len(new_annotations["annotations"])}') + with open(output_file, 'w') as f: + json.dump(new_annotations, f) + print(f'Annotations saved to {output_file}') + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Split and update dataset annotations.') + parser.add_argument( + '--base_dir', + type=str, + default='/datassd/objects365', + help='Base directory of the dataset, e.g., /data/Objects365/data' + ) + parser.add_argument( + '--new_val_size', + type=int, + default=5000, + help='Number of images to include in the new validation set (default: 5000)' + ) + parser.add_argument( + '--output_suffix', + type=str, + default='new', + help='Suffix to add to new annotation files (default: new)' + ) + return parser.parse_args() + +def main(): + args = parse_arguments() + base_dir = args.base_dir + new_val_size = args.new_val_size + output_suffix = args.output_suffix + + # Define paths based on the base directory + original_train_ann_file = os.path.join(base_dir, 'train', 'zhiyuan_objv2_train.json') + original_val_ann_file = os.path.join(base_dir, 'val', 'zhiyuan_objv2_val.json') + + new_val_ann_file = os.path.join(base_dir, 'val', f'{output_suffix}_zhiyuan_objv2_val.json') + new_train_ann_file = os.path.join(base_dir, 'train', f'{output_suffix}_zhiyuan_objv2_train.json') + + # Check if original annotation files exist + if not os.path.isfile(original_train_ann_file): + print(f'Error: Training annotation file not found at {original_train_ann_file}') + return + if not os.path.isfile(original_val_ann_file): + print(f'Error: Validation annotation file not found at {original_val_ann_file}') + return + + # Load the original training and validation annotations + print('Loading original training annotations...') + with open(original_train_ann_file, 'r') as f: + train_annotations = json.load(f) + print('Training annotations loaded.') + + print('Loading original validation annotations...') + with open(original_val_ann_file, 'r') as f: + val_annotations = json.load(f) + print('Validation annotations loaded.') + + # Extract image IDs from the original validation set + print('Extracting image IDs from the validation set...') + val_image_ids = [img['id'] for img in val_annotations['images']] + print(f'Total validation images: {len(val_image_ids)}') + + # Split image IDs for the new training and validation sets + print(f'Splitting validation images into new validation set of size {new_val_size} and training set...') + new_val_image_ids = val_image_ids[:new_val_size] + new_train_image_ids = val_image_ids[new_val_size:] + print(f'New validation set size: 
{len(new_val_image_ids)}') + print(f'New training set size from validation images: {len(new_train_image_ids)}') + + # Create new validation annotation file + print('Creating new validation annotations...') + create_split_annotations(val_annotations, new_val_image_ids, None, new_val_ann_file) + print('New validation annotations created.') + + # Combine the remaining validation images and annotations with the original training data + print('Preparing new training images and annotations...') + new_train_images = [img for img in val_annotations['images'] if img['id'] in new_train_image_ids] + print(f'Number of images from validation to add to training: {len(new_train_images)}') + new_train_images = update_image_paths(new_train_images, 'images_from_val') + new_train_annotations = [ann for ann in val_annotations['annotations'] if ann['image_id'] in new_train_image_ids] + print(f'Number of annotations from validation to add to training: {len(new_train_annotations)}') + + # Add the original training images and annotations + print('Adding original training images and annotations...') + new_train_images.extend(train_annotations['images']) + new_train_annotations.extend(train_annotations['annotations']) + print(f'Total training images: {len(new_train_images)}') + print(f'Total training annotations: {len(new_train_annotations)}') + + # Create a new training annotation dictionary + print('Creating new training annotations dictionary...') + new_train_annotations_dict = { + 'images': new_train_images, + 'annotations': new_train_annotations, + 'categories': train_annotations['categories'] + } + print('New training annotations dictionary created.') + + # Save the new training annotations + print('Saving new training annotations...') + with open(new_train_ann_file, 'w') as f: + json.dump(new_train_annotations_dict, f) + print(f'New training annotations saved to {new_train_ann_file}') + + print('Processing completed successfully.') + +if __name__ == '__main__': + main() diff --git a/tools/dataset/resize_obj365.py b/tools/dataset/resize_obj365.py new file mode 100644 index 0000000000000000000000000000000000000000..d14fd865ef5e260d0d6e8b26f32daddc8c088b69 --- /dev/null +++ b/tools/dataset/resize_obj365.py @@ -0,0 +1,147 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import os +import json +from PIL import Image +from concurrent.futures import ThreadPoolExecutor +import argparse + + +def resize_image_and_update_annotations(image_path, annotations, max_size=640): + print(f"Processing image: {image_path}") + try: + with Image.open(image_path) as img: + w, h = img.size + if max(w, h) <= max_size: + return annotations, w, h, False # No need to resize + + scale = max_size / max(w, h) + new_w = int(w * scale) + new_h = int(h * scale) + print(f"Resizing image to width={new_w}, height={new_h}") + + img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) + new_image_path = image_path.replace('.jpg', '_resized{}.jpg'.format(max_size)) + img.save(new_image_path) + print(f"Resized image saved: {new_image_path}") + print(f"Original size: ({w}, {h}), New size: ({new_w}, {new_h})") + + # Update annotations + for ann in annotations: + ann['area'] = ann['area'] * (scale ** 2) + ann['bbox'] = [coord * scale for coord in ann['bbox']] + if 'orig_size' in ann: + ann['orig_size'] = (new_w, new_h) + if 'size' in ann: + ann['size'] = (new_w, new_h) + + except Exception as e: + print(f"Error processing {image_path}: {e}") + return None + + return annotations, new_w, new_h, True + +def resize_images_and_update_annotations(base_dir, subset, max_size=640, num_workers=4): + print(f"Starting to resize images and update annotations for subset: {subset}") + json_file = os.path.join(base_dir, subset, 'new_zhiyuan_objv2_{}.json'.format(subset)) + if not os.path.isfile(json_file): + print(f'Error: JSON file not found at {json_file}') + return + + print(f"Loading JSON file: {json_file}") + with open(json_file, 'r') as f: + data = json.load(f) + print("JSON file loaded.") + + print("Preparing image annotations mapping...") + image_annotations = {img['id']: [] for img in data['images']} + for ann in data['annotations']: + image_annotations[ann['image_id']].append(ann) + print("Image annotations mapping prepared.") + + def process_image(image_info): + image_path = os.path.join(base_dir, subset, image_info['file_name']) + results = resize_image_and_update_annotations(image_path, image_annotations[image_info['id']], max_size) + if results is None: + updated_annotations, new_w, new_h, resized = None, None, None, None + else: + updated_annotations, new_w, new_h, resized = results + return image_info, updated_annotations, new_w, new_h, resized + + print(f"Processing images with {num_workers} worker threads...") + with ThreadPoolExecutor(max_workers=num_workers) as executor: + results = list(executor.map(process_image, data['images'])) + print("Image processing completed.") + + new_images = [] + new_annotations = [] + + print("Updating image and annotation data...") + for image_info, updated_annotations, new_w, new_h, resized in results: + if updated_annotations is not None: + image_info['width'] = new_w + image_info['height'] = new_h + image_annotations[image_info['id']] = updated_annotations + if resized: + image_info['file_name'] = image_info['file_name'].replace('.jpg', '_resized{}.jpg'.format(max_size)) + new_images.append(image_info) + new_annotations.extend(updated_annotations) + print(f"Total images processed: {len(new_images)}") + print(f"Total annotations updated: {len(new_annotations)}") + + new_data = { + 'images': new_images, + 'annotations': new_annotations, + 'categories': data['categories'] + } + + new_json_file = json_file.replace('.json', '_resized{}.json'.format(max_size)) + print('Saving new training annotations...') + with open(new_json_file, 'w') as f: + 
json.dump(new_data, f) + print(f'New JSON file saved to {new_json_file}') + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Resize images and update dataset annotations for both train and val sets.') + parser.add_argument( + '--base_dir', + type=str, + default='/datassd/objects365', + help='Base directory of the dataset, e.g., /data/Objects365/data' + ) + parser.add_argument( + '--max_size', + type=int, + default=640, + help='Maximum size for the longer side of the image (default: 640)' + ) + parser.add_argument( + '--num_workers', + type=int, + default=4, + help='Number of worker threads for parallel processing (default: 4)' + ) + args = parser.parse_args() + return args + +def main(): + args = parse_arguments() + base_dir = args.base_dir + max_size = args.max_size + num_workers = args.num_workers + + subsets = ['train', 'val'] + for subset in subsets: + print(f'Processing subset: {subset}') + resize_images_and_update_annotations( + base_dir=base_dir, + subset=subset, + max_size=max_size, + num_workers=num_workers + ) + print("All subsets processed.") + +if __name__ == "__main__": + main() diff --git a/tools/deployment/export_onnx.py b/tools/deployment/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..186fda35f2a319dc841bcd2d38752df5c851b1b6 --- /dev/null +++ b/tools/deployment/export_onnx.py @@ -0,0 +1,109 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
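+
+Usage (a minimal sketch; the checkpoint path is an illustrative assumption):
+
+    python tools/deployment/export_onnx.py \
+        -c configs/dfine/dfine_hgnetv2_l_coco.yml \
+        -r model.pth --check --simplify
+
+The exported .onnx file is written next to the checkpoint (or to model.onnx
+when no checkpoint is given).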
+""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) + +import torch +import torch.nn as nn + +from engine.core import YAMLConfig + + +def main(args, ): + """main + """ + cfg = YAMLConfig(args.config, resume=args.resume) + + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + + else: + # raise AttributeError('Only support resume to load model.state_dict by now.') + print('not load model.state_dict, use default init state dict...') + + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model() + + img_size = cfg.yaml_cfg["eval_spatial_size"] + data = torch.rand(32, 3, *img_size) + size = torch.tensor([img_size]) + _ = model(data, size) + + dynamic_axes = { + 'images': {0: 'N', }, + 'orig_target_sizes': {0: 'N'} + } + + output_file = args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' + + torch.onnx.export( + model, + (data, size), + output_file, + input_names=['images', 'orig_target_sizes'], + output_names=['labels', 'boxes', 'scores'], + dynamic_axes=dynamic_axes, + opset_version=args.opset, + verbose=False, + do_constant_folding=True, + ) + + if args.check: + import onnx + onnx_model = onnx.load(output_file) + onnx.checker.check_model(onnx_model) + print('Check export onnx model done...') + + if args.simplify: + import onnx + import onnxsim + dynamic = True + # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None + input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None + onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) + onnx.save(onnx_model_simplify, output_file) + print(f'Simplify onnx model {check}...') + + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', default='configs/dfine/dfine_hgnetv2_l_coco.yml', type=str, ) + parser.add_argument('--resume', '-r', type=str, ) + parser.add_argument('--opset', type=int, default=17,) + parser.add_argument('--check', action='store_true') + parser.add_argument('--simplify', action='store_true') + args = parser.parse_args() + main(args) diff --git a/tools/deployment/export_yolo_w_nms.py b/tools/deployment/export_yolo_w_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..95c89b213d9436cba49c093d4427757b907a5e03 --- /dev/null +++ b/tools/deployment/export_yolo_w_nms.py @@ -0,0 +1,74 @@ +import torch +import torchvision + +import numpy as np +import onnxruntime as ort + +from utils import yolo_insert_nms + +class YOLO11(torch.nn.Module): + def __init__(self, name) -> None: + super().__init__() + from ultralytics import YOLO + # Load a model + # build a new model from scratch + # model = YOLO(f'{name}.yaml') + + # load a pretrained model (recommended for training) + model = YOLO("yolo11n.pt") + self.model = model.model + + def forward(self, x): + 
'''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 + ''' + pred: torch.Tensor = self.model(x)[0] # n 84 8400, + pred = pred.permute(0, 2, 1) + boxes, scores = pred.split([4, 80], dim=-1) + boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + + return boxes, scores + + + +def export_onnx(name='yolov8n'): + '''export onnx + ''' + m = YOLO11(name) + + x = torch.rand(1, 3, 640, 640) + dynamic_axes = { + 'image': {0: '-1'} + } + torch.onnx.export(m, x, f'{name}.onnx', + input_names=['image'], + output_names=['boxes', 'scores'], + opset_version=13, + dynamic_axes=dynamic_axes) + + data = np.random.rand(1, 3, 640, 640).astype(np.float32) + sess = ort.InferenceSession(f'{name}.onnx') + _ = sess.run(output_names=None, input_feed={'image': data}) + + import onnx + import onnxslim + model_onnx = onnx.load(f'{name}.onnx') + model_onnx = onnxslim.slim(model_onnx) + onnx.save(model_onnx, f'{name}.onnx') + + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--name', type=str, default='yolo11n_tuned') + parser.add_argument('--score_threshold', type=float, default=0.01) + parser.add_argument('--iou_threshold', type=float, default=0.6) + parser.add_argument('--max_output_boxes', type=int, default=300) + args = parser.parse_args() + + export_onnx(name=args.name) + + yolo_insert_nms(path=f'{args.name}.onnx', + score_threshold=args.score_threshold, + iou_threshold=args.iou_threshold, + max_output_boxes=args.max_output_boxes, ) diff --git a/tools/inference/onnx_inf.py b/tools/inference/onnx_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0016019cd43a9b8d64c454493c7c4283a5b0d4 --- /dev/null +++ b/tools/inference/onnx_inf.py @@ -0,0 +1,175 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
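+
+Usage (a minimal sketch; the file names are illustrative):
+
+    python tools/inference/onnx_inf.py --onnx model.onnx --input image.jpg -ms s
+
+Results are written to onnx_result.jpg for images or onnx_result.mp4 for
+videos. Model sizes atto/femto/pico/n skip ImageNet normalization.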
+""" + +import cv2 +import numpy as np +import onnxruntime as ort +import torch +import torchvision.transforms as T +from PIL import Image, ImageDraw + + +def resize_with_aspect_ratio(image, size, interpolation=Image.BILINEAR): + """Resizes an image while maintaining aspect ratio and pads it.""" + original_width, original_height = image.size + ratio = min(size / original_width, size / original_height) + new_width = int(original_width * ratio) + new_height = int(original_height * ratio) + image = image.resize((new_width, new_height), interpolation) + + # Create a new image with the desired size and paste the resized image onto it + new_image = Image.new("RGB", (size, size)) + new_image.paste(image, ((size - new_width) // 2, (size - new_height) // 2)) + return new_image, ratio, (size - new_width) // 2, (size - new_height) // 2 + + +def draw(images, labels, boxes, scores, ratios, paddings, thrh=0.4): + result_images = [] + for i, im in enumerate(images): + draw = ImageDraw.Draw(im) + scr = scores[i] + lab = labels[i][scr > thrh] + box = boxes[i][scr > thrh] + scr = scr[scr > thrh] + + ratio = ratios[i] + pad_w, pad_h = paddings[i] + + for lbl, bb in zip(lab, box): + # Adjust bounding boxes according to the resizing and padding + bb = [ + (bb[0] - pad_w) / ratio, + (bb[1] - pad_h) / ratio, + (bb[2] - pad_w) / ratio, + (bb[3] - pad_h) / ratio, + ] + draw.rectangle(bb, outline='red') + draw.text((bb[0], bb[1]), text=str(lbl), fill='blue') + + result_images.append(im) + return result_images + + +def process_image(sess, im_pil, size=640, model_size='s'): + # Resize image while preserving aspect ratio + resized_im_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(im_pil, size) + orig_size = torch.tensor([[resized_im_pil.size[1], resized_im_pil.size[0]]]) + + transforms = T.Compose([ + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if model_size not in ['atto', 'femto', 'pico', 'n'] + else T.Lambda(lambda x: x) + ]) + im_data = transforms(resized_im_pil).unsqueeze(0) + + output = sess.run( + output_names=None, + input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} + ) + + labels, boxes, scores = output + + result_images = draw( + [im_pil], labels, boxes, scores, + [ratio], [(pad_w, pad_h)] + ) + result_images[0].save('onnx_result.jpg') + print("Image processing complete. 
Result saved as 'result.jpg'.") + + +def process_video(sess, video_path, size=640, model_size='s'): + cap = cv2.VideoCapture(video_path) + + # Get video properties + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Define the codec and create VideoWriter object + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter('onnx_result.mp4', fourcc, fps, (orig_w, orig_h)) + + frame_count = 0 + print("Processing video frames...") + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Convert frame to PIL image + frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + + # Resize frame while preserving aspect ratio + resized_frame_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(frame_pil, size) + orig_size = torch.tensor([[resized_frame_pil.size[1], resized_frame_pil.size[0]]]) + + transforms = T.Compose([ + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if model_size not in ['atto', 'femto', 'pico', 'n'] + else T.Lambda(lambda x: x) + ]) + im_data = transforms(resized_frame_pil).unsqueeze(0) + + output = sess.run( + output_names=None, + input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} + ) + + labels, boxes, scores = output + + # Draw detections on the original frame + result_images = draw( + [frame_pil], labels, boxes, scores, + [ratio], [(pad_w, pad_h)] + ) + frame_with_detections = result_images[0] + + # Convert back to OpenCV image + frame = cv2.cvtColor(np.array(frame_with_detections), cv2.COLOR_RGB2BGR) + + # Write the frame + out.write(frame) + frame_count += 1 + + if frame_count % 10 == 0: + print(f"Processed {frame_count} frames...") + + cap.release() + out.release() + print("Video processing complete. Result saved as 'result.mp4'.") + + +def main(args): + """Main function.""" + # Load the ONNX model + sess = ort.InferenceSession(args.onnx) + size = sess.get_inputs()[0].shape[2] + print(f"Using device: {ort.get_device()}") + + input_path = args.input + + try: + # Try to open the input as an image + im_pil = Image.open(input_path).convert('RGB') + process_image(sess, im_pil, size, args.model_size) + except IOError: + # Not an image, process as video + process_video(sess, input_path, size, args.model_size) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--onnx', type=str, required=True, help='Path to the ONNX model file.') + parser.add_argument('--input', type=str, required=True, help='Path to the input image or video file.') + parser.add_argument('-ms', '--model-size', type=str, required=True, choices=['atto', 'femto', 'pico', 'n', 's', 'm', 'l', 'x'], + help='Model size') + args = parser.parse_args() + main(args) diff --git a/tools/inference/openvino_inf.py b/tools/inference/openvino_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..4a66755a256f56a5594508d003d1820df23fd2e3 --- /dev/null +++ b/tools/inference/openvino_inf.py @@ -0,0 +1,7 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + + +# please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO diff --git a/tools/inference/requirements.txt b/tools/inference/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..46a470c8805cfc80a3bf4c4cb6a87dacc237021f --- /dev/null +++ b/tools/inference/requirements.txt @@ -0,0 +1,2 @@ +onnxruntime +tensorrt diff --git a/tools/inference/torch_inf.py b/tools/inference/torch_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..86e090016bc54870482d9bb67a22af5b1ef27227 --- /dev/null +++ b/tools/inference/torch_inf.py @@ -0,0 +1,167 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +""" + +import os +import sys + +import cv2 # Added for video processing +import numpy as np +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image, ImageDraw + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from engine.core import YAMLConfig + + +def draw(images, labels, boxes, scores, thrh=0.45): + for i, im in enumerate(images): + draw = ImageDraw.Draw(im) + + scr = scores[i] + lab = labels[i][scr > thrh] + box = boxes[i][scr > thrh] + scrs = scr[scr > thrh] + + for j, b in enumerate(box): + draw.rectangle(list(b), outline='red') + draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", fill='blue', ) + + im.save('torch_results.jpg') + + +def process_image(model, device, file_path, size=(640, 640), vit_backbone=False): + im_pil = Image.open(file_path).convert('RGB') + w, h = im_pil.size + orig_size = torch.tensor([[w, h]]).to(device) + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + im_data = transforms(im_pil).unsqueeze(0).to(device) + + output = model(im_data, orig_size) + labels, boxes, scores = output + + draw([im_pil], labels, boxes, scores) + + +def process_video(model, device, file_path, size=(640, 640), vit_backbone=False): + cap = cv2.VideoCapture(file_path) + + # Get video properties + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Define the codec and create VideoWriter object + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h)) + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + + frame_count = 0 + print("Processing video frames...") + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Convert frame to PIL image + frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + + w, h = frame_pil.size + orig_size = torch.tensor([[w, h]]).to(device) + + im_data = transforms(frame_pil).unsqueeze(0).to(device) + + output = model(im_data, orig_size) + labels, boxes, scores = output + + # Draw detections on the frame + draw([frame_pil], labels, boxes, scores) + + # Convert back to OpenCV image + frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR) + + # Write the frame + out.write(frame) + frame_count += 1 + + if 
frame_count % 10 == 0:
+            print(f"Processed {frame_count} frames...")
+
+    cap.release()
+    out.release()
+    print("Video processing complete. Result saved as 'torch_results.mp4'.")
+
+
+def main(args):
+    """Main function"""
+    cfg = YAMLConfig(args.config, resume=args.resume)
+
+    if 'HGNetv2' in cfg.yaml_cfg:
+        cfg.yaml_cfg['HGNetv2']['pretrained'] = False
+
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu')
+        if 'ema' in checkpoint:
+            state = checkpoint['ema']['module']
+        else:
+            state = checkpoint['model']
+    else:
+        raise AttributeError('Only loading model.state_dict via --resume is supported for now.')
+
+    # Load train mode state and convert to deploy mode
+    cfg.model.load_state_dict(state)
+
+    class Model(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    device = args.device
+    model = Model().to(device)
+    img_size = cfg.yaml_cfg["eval_spatial_size"]
+    vit_backbone = cfg.yaml_cfg.get('DINOv3STAs', False)
+
+    # Check if the input file is an image or a video
+    file_path = args.input
+    if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
+        # Process as image
+        process_image(model, device, file_path, img_size, vit_backbone)
+        print("Image processing complete.")
+    else:
+        # Process as video
+        process_video(model, device, file_path, img_size, vit_backbone)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, required=True)
+    parser.add_argument('-r', '--resume', type=str, required=True)
+    parser.add_argument('-i', '--input', type=str, required=True)
+    parser.add_argument('-d', '--device', type=str, default='cpu')
+    args = parser.parse_args()
+    main(args)
diff --git a/tools/inference/torch_inf_vis.py b/tools/inference/torch_inf_vis.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5ef632c84a1e1fe69441d05aa7aaceb5423f38
--- /dev/null
+++ b/tools/inference/torch_inf_vis.py
@@ -0,0 +1,155 @@
+"""
+DEIMv2: Real-Time Object Detection Meets DINOv3
+Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved.
+---------------------------------------------------------------------------------
+Modified from D-FINE (https://github.com/Peterande/D-FINE)
+Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
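+
+Usage (a minimal sketch; the config and checkpoint paths are illustrative):
+
+    python tools/inference/torch_inf_vis.py \
+        -c configs/dfine/dfine_hgnetv2_l_coco.yml -r best.pth \
+        -d ./data/fiftyone/validation/data -o ./vis_results
+
+Requires a CUDA device; each image is saved to the output directory as
+vis_<original filename>.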
+""" + +import os +import random +import sys + +import cv2 # Added for video processing +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image, ImageDraw, ImageFont + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from engine.core import YAMLConfig + +label_map = { + 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane', + 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight', + 11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter', + 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', + 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', + 25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe', + 30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', + 35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat', + 40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket', + 44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork', + 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', + 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog', + 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa', + 64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window', + 69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop', + 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave', + 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender', + 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear', + 89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush' +} + + +COLORS = plt.cm.tab20.colors +COLOR_MAP = {label: tuple([int(c * 255) for c in COLORS[i % len(COLORS)]]) for i, label in enumerate(label_map)} + + + +def draw(image, labels, boxes, scores, thrh=0.45): + draw = ImageDraw.Draw(image) + font = ImageFont.load_default() + labels, boxes, scores = labels[scores > thrh], boxes[scores > thrh], scores[scores > thrh] + + for j, box in enumerate(boxes): + category = labels[j].item() + color = COLOR_MAP.get(category, (255, 255, 255)) + box = list(map(int, box)) + + + draw.rectangle(box, outline=color, width=3) + + text = f"{label_map[category]} {scores[j].item():.2f}" + text_bbox = draw.textbbox((0, 0), text, font=font) + text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1] + + text_background = [box[0], box[1] - text_height - 2, box[0] + text_width + 4, box[1]] + draw.rectangle(text_background, fill=color) + + draw.text((box[0] + 2, box[1] - text_height - 2), text, fill="black", font=font) + + return image + + +def process_dataset(model, dataset_path, output_path, thrh=0.5, size=(640, 640), vit_backbone=False): + os.makedirs(output_path, exist_ok=True) + image_paths = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith(('.jpg', '.png'))] + + transforms = T.Compose([ + T.Resize(size), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + if vit_backbone else T.Lambda(lambda x: x) + ]) + + print(f"Found {len(image_paths)} images in validation set...") + for idx, file_path in enumerate(image_paths): + im_pil = Image.open(file_path).convert('RGB') + w, h = im_pil.size + orig_size = torch.tensor([[w, h]]).cuda() + + # 图像预处理 + im_data = transforms(im_pil).unsqueeze(0).cuda() + output = model(im_data, orig_size) + labels, boxes, scores = 
output[0]['labels'], output[0]['boxes'], output[0]['scores'] + + # 绘制结果 + vis_image = draw(im_pil.copy(), labels, boxes, scores, thrh) + save_path = os.path.join(output_path, f"vis_{os.path.basename(file_path)}") + vis_image.save(save_path) + + if idx % 500 == 0: + print(f"Processed {idx}/{len(image_paths)} images...") + + print("Visualization complete. Results saved in:", output_path) + + +def main(args): + """Main function""" + cfg = YAMLConfig(args.config, resume=args.resume) + + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('Only support resume to load model.state_dict by now.') + + # Load train mode state and convert to deploy mode + cfg.model.load_state_dict(state) + + class Model(nn.Module): + def __init__(self): + super().__init__() + self.model = cfg.model.eval().cuda() + self.postprocessor = cfg.postprocessor.eval().cuda() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model() + img_size = cfg.yaml_cfg["eval_spatial_size"] + vit_backbone = cfg.yaml_cfg.get('DINOv3STAs', False) + + process_dataset(model, args.dataset, args.output, thrh=0.45, size=img_size, vit_backbone=vit_backbone) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True) + parser.add_argument('-r', '--resume', type=str, required=True) + parser.add_argument('-d', '--dataset', type=str, default='./data/fiftyone/validation/data') + parser.add_argument('-o', '--output', type=str, required=True, help="Path to save visualized results") + args = parser.parse_args() + main(args) diff --git a/tools/inference/trt_inf.py b/tools/inference/trt_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..e98e0560b14f885df3ffd9ce86720227332fdb8c --- /dev/null +++ b/tools/inference/trt_inf.py @@ -0,0 +1,242 @@ +""" +DEIMv2: Real-Time Object Detection Meets DINOv3 +Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
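+
+Usage (a minimal sketch; file names are illustrative). A TensorRT engine must
+first be built from the exported ONNX model, e.g. with trtexec:
+
+    trtexec --onnx=model.onnx --saveEngine=model.engine
+    python tools/inference/trt_inf.py -trt model.engine -i image.jpg -s 640 -ms s
+
+Results are written to trt_result.jpg for images or trt_result.mp4 for videos.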
+""" + +import collections +import contextlib +import os +import time +from collections import OrderedDict + +import cv2 # Added for video processing +import numpy as np +import tensorrt as trt +import torch +import torchvision.transforms as T +from PIL import Image, ImageDraw + + +class TimeProfiler(contextlib.ContextDecorator): + def __init__(self): + self.total = 0 + + def __enter__(self): + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): + self.total += self.time() - self.start + + def reset(self): + self.total = 0 + + def time(self): + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() + +class TRTInference(object): + def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False): + self.engine_path = engine_path + self.device = device + self.backend = backend + self.max_batch_size = max_batch_size + + self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) + + self.engine = self.load_engine(engine_path) + self.context = self.engine.create_execution_context() + self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) + self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) + self.input_names = self.get_input_names() + self.output_names = self.get_output_names() + self.time_profile = TimeProfiler() + + def load_engine(self, path): + trt.init_libnvinfer_plugins(self.logger, '') + with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def get_input_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + names.append(name) + return names + + def get_output_names(self): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: + names.append(name) + return names + + def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict: + Binding = collections.namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + bindings = OrderedDict() + + for i, name in enumerate(engine): + shape = engine.get_tensor_shape(name) + dtype = trt.nptype(engine.get_tensor_dtype(name)) + + if shape[0] == -1: + shape[0] = max_batch_size + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + context.set_input_shape(name, shape) + + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) + + return bindings + + def run_torch(self, blob): + for n in self.input_names: + if self.bindings[n].shape != blob[n].shape: + self.context.set_input_shape(n, blob[n].shape) + self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) + + assert self.bindings[n].data.dtype == blob[n].dtype, '{} dtype mismatch'.format(n) + + self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) + self.context.execute_v2(list(self.bindings_addr.values())) + outputs = {n: self.bindings[n].data for n in self.output_names} + + return outputs + + def __call__(self, blob): + if self.backend == 'torch': + return self.run_torch(blob) + else: + raise NotImplementedError("Only 'torch' backend is implemented.") + + def synchronize(self): + if self.backend == 'torch' and torch.cuda.is_available(): + torch.cuda.synchronize() + +def draw(images, labels, boxes, scores, thrh=0.4): + for i, im in enumerate(images): + draw 
= ImageDraw.Draw(im)
+        scr = scores[i]
+        lab = labels[i][scr > thrh]
+        box = boxes[i][scr > thrh]
+        scrs = scr[scr > thrh]
+
+        for j, b in enumerate(box):
+            draw.rectangle(list(b), outline='red')
+            draw.text(
+                (b[0], b[1]),
+                text=f"{lab[j].item()} {round(scrs[j].item(), 2)}",
+                fill='blue',
+            )
+
+    return images
+
+def process_image(m, file_path, device, size=(640, 640), model_size='s'):
+    im_pil = Image.open(file_path).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None].to(device)
+
+    transforms = T.Compose([
+        T.Resize(size),
+        T.ToTensor(),
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        if model_size not in ['atto', 'femto', 'pico', 'n']
+        else T.Lambda(lambda x: x)
+    ])
+    im_data = transforms(im_pil)[None]
+
+    blob = {
+        'images': im_data.to(device),
+        'orig_target_sizes': orig_size.to(device),
+    }
+
+    output = m(blob)
+    result_images = draw([im_pil], output['labels'], output['boxes'], output['scores'])
+    result_images[0].save('trt_result.jpg')
+    print("Image processing complete. Result saved as 'trt_result.jpg'.")
+
+def process_video(m, file_path, device, size=(640, 640), model_size='s'):
+    cap = cv2.VideoCapture(file_path)
+
+    # Get video properties
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+    # Define the codec and create VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter('trt_result.mp4', fourcc, fps, (orig_w, orig_h))
+
+    transforms = T.Compose([
+        T.Resize(size),
+        T.ToTensor(),
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        if model_size not in ['atto', 'femto', 'pico', 'n']
+        else T.Lambda(lambda x: x)
+    ])
+
+    frame_count = 0
+    print("Processing video frames...")
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Convert frame to PIL image
+        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+        w, h = frame_pil.size
+        orig_size = torch.tensor([w, h])[None].to(device)
+
+        im_data = transforms(frame_pil)[None]
+
+        blob = {
+            'images': im_data.to(device),
+            'orig_target_sizes': orig_size.to(device),
+        }
+
+        output = m(blob)
+
+        # Draw detections on the frame
+        result_images = draw([frame_pil], output['labels'], output['boxes'], output['scores'])
+
+        # Convert back to OpenCV image
+        frame = cv2.cvtColor(np.array(result_images[0]), cv2.COLOR_RGB2BGR)
+
+        # Write the frame
+        out.write(frame)
+        frame_count += 1
+
+        if frame_count % 10 == 0:
+            print(f"Processed {frame_count} frames...")
+
+    cap.release()
+    out.release()
+    print("Video processing complete. Result saved as 'trt_result.mp4'.")
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-trt', '--trt', type=str, required=True)
+    parser.add_argument('-i', '--input', type=str, required=True)
+    parser.add_argument('-d', '--device', type=str, default='cuda:0')
+    parser.add_argument('-s', '--size', type=int, required=True, help='input size, e.g., 640')
+    parser.add_argument('-ms', '--model-size', type=str, required=True, choices=['atto', 'femto', 'pico', 'n', 's', 'm', 'l', 'x'])
+
+
+    args = parser.parse_args()
+
+    m = TRTInference(args.trt, device=args.device)
+    size = (args.size,) * 2
+
+    file_path = args.input
+    if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
+        # Process as image
+        process_image(m, file_path, args.device, size, args.model_size)
+    else:
+        # Process as video
+        process_video(m, file_path, args.device, size, args.model_size)
diff --git a/tools/reference/convert_weight.py b/tools/reference/convert_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..9651d19a98b181658400137a74bcaf39be088567
--- /dev/null
+++ b/tools/reference/convert_weight.py
@@ -0,0 +1,29 @@
+import torch
+import os
+import argparse
+
+def save_only_ema_weights(checkpoint_file):
+    """Extract and save only the EMA weights."""
+    checkpoint = torch.load(checkpoint_file, map_location='cpu')
+
+    weights = {}
+    if 'ema' in checkpoint:
+        weights['model'] = checkpoint['ema']['module']
+    else:
+        raise ValueError("The checkpoint does not contain 'ema'.")
+
+    dir_name, base_name = os.path.split(checkpoint_file)
+    name, ext = os.path.splitext(base_name)
+    output_file = os.path.join(dir_name, f"{name}_converted{ext}")
+
+    torch.save(weights, output_file)
+    print(f"EMA weights saved to {output_file}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Extract and save only EMA weights.")
+    parser.add_argument('checkpoint_dir', type=str, help="Path to the directory containing .pth checkpoint files.")
+
+    args = parser.parse_args()
+    for file in os.listdir(args.checkpoint_dir):
+        if '.pth' in file and '_converted' not in file:
+            save_only_ema_weights(os.path.join(args.checkpoint_dir, file))
diff --git a/tools/reference/safe_training.sh b/tools/reference/safe_training.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d3c752a48f27511a353d65dbb9e8f97146ad4817
--- /dev/null
+++ b/tools/reference/safe_training.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Function to display the menu for selecting model size
+select_model_size() {
+    echo "Select model size:"
+    select size in s m l x; do
+        case $size in
+            s|m|l|x)
+                echo "You selected model size: $size"
+                MODEL_SIZE=$size
+                break
+                ;;
+            *)
+                echo "Invalid selection. Please try again."
+                ;;
+        esac
+    done
+}
+
+# Function to display the menu for selecting task
+select_task() {
+    echo "Select task:"
+    select task in obj365 obj2coco coco; do
+        case $task in
+            obj365|obj2coco|coco)
+                echo "You selected task: $task"
+                TASK=$task
+                break
+                ;;
+            *)
+                echo "Invalid selection. Please try again."
+                ;;
+        esac
+    done
+}
+
+# Function to ask if the user wants to save logs to a txt file
+ask_save_logs() {
+    while true; do
+        read -p "Do you want to save logs to a txt file? (y/n): " yn
(y/n): " yn + case $yn in + [Yy]* ) + SAVE_LOGS=true + break + ;; + [Nn]* ) + SAVE_LOGS=false + break + ;; + * ) echo "Please answer yes or no.";; + esac + done +} + +# Call the functions to let the user select +select_model_size +select_task +ask_save_logs + +# Set config file and output directory based on selection +if [ "$TASK" = "coco" ]; then + CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" +else + CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" +fi + +OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}" + +# Construct the training command +TRAIN_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR" + +# Append log redirection if SAVE_LOGS is true +if [ "$SAVE_LOGS" = true ]; then + LOG_FILE="${MODEL_SIZE}_${TASK}.txt" + TRAIN_CMD="$TRAIN_CMD &> \"$LOG_FILE\" 2>&1 &" +else + TRAIN_CMD="$TRAIN_CMD &" +fi + +# Run the training command +eval $TRAIN_CMD +if [ $? -ne 0 ]; then + echo "First training failed, restarting with resume option..." + while true; do + RESUME_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR -r ${OUTPUT_DIR}/last.pth" + if [ "$SAVE_LOGS" = true ]; then + LOG_FILE="${MODEL_SIZE}_${TASK}_2.txt" + RESUME_CMD="$RESUME_CMD &> \"$LOG_FILE\" 2>&1 &" + else + RESUME_CMD="$RESUME_CMD &" + fi + eval $RESUME_CMD + if [ $? -eq 0 ]; then + break + fi + done +fi diff --git a/tools/visualization/fiftyone_vis.py b/tools/visualization/fiftyone_vis.py new file mode 100644 index 0000000000000000000000000000000000000000..5831293b16c8c77209e97411bee00695332b24e2 --- /dev/null +++ b/tools/visualization/fiftyone_vis.py @@ -0,0 +1,307 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import argparse +import os +import subprocess +import sys +import time + +import fiftyone as fo +import fiftyone.core.fields as fof +import fiftyone.core.labels as fol +import fiftyone.core.models as fom +import fiftyone.zoo as foz +import torch +import torchvision.transforms as transforms +import tqdm +from fiftyone import ViewField as F +from PIL import Image + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) +from engine.core import YAMLConfig + + +def kill_existing_mongod(): + try: + result = subprocess.run(['ps', 'aux'], stdout=subprocess.PIPE) + processes = result.stdout.decode('utf-8').splitlines() + + for process in processes: + if 'mongod' in process and '--dbpath' in process: + # find mongod PID + pid = int(process.split()[1]) + print(f"Killing existing mongod process with PID: {pid}") + # kill mongod session + os.kill(pid, 9) + except Exception as e: + print(f"Error occurred while killing mongod: {e}") + +kill_existing_mongod() + + +label_map = { + 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane', + 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight', + 11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter', + 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', + 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', + 25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe', + 30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', + 35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat', + 40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket', + 44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork', + 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', + 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog', + 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa', + 64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window', + 69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop', + 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave', + 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender', + 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear', + 89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush' +} + +class CustomModel(fom.Model): + def __init__(self, cfg): + super().__init__() + self.model = cfg.model.eval().cuda() + self.postprocessor = cfg.postprocessor.eval().cuda() + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Resize((640, 640)), # Resize to the size expected by your model + # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + @property + def media_type(self): + return "image" + + @property + def has_logits(self): + return False + + @property + def has_embeddings(self): + return False + + @property + def ragged_batches(self): + return False + + @property + def transforms(self): + return None + + @property + def preprocess(self): + return True + + @preprocess.setter + def preprocess(self, value): + pass + + def _convert_predictions(self, predictions): + class_labels, bboxes, scores = predictions[0]['labels'], predictions[0]['boxes'], predictions[0]['scores'] + + detections = [] + for label, bbox, score in zip(class_labels, bboxes, scores): + detection = fol.Detection( + label=label_map[label.item()], + bounding_box=[ + bbox[0] / 640, # Normalized coordinates + bbox[1] / 640, + (bbox[2] - 
bbox[0]) / 640, + (bbox[3] - bbox[1]) / 640 + ], + confidence=score + ) + detections.append(detection) + + return fol.Detections(detections=detections) + + def predict(self, image): + image = Image.fromarray(image).convert('RGB') + image_tensor = self.transform(image).unsqueeze(0).cuda() + outputs = self.model(image_tensor) + orig_target_sizes = torch.tensor([[640, 640]]).cuda() + predictions = self.postprocessor(outputs, orig_target_sizes) + return self._convert_predictions(predictions) + + def predict_all(self, images): + image_tensors = [] + for image in images: + image = Image.fromarray(image) + image_tensor = self.transform(image) + image_tensors.append(image_tensor) + image_tensors = torch.stack(image_tensors).cuda() + outputs = self.model(image_tensors) + orig_target_sizes = torch.tensor([[640, 640] for image in images]).cuda() + predictions = self.postprocessor(outputs, orig_target_sizes) + converted_predictions = [self._convert_predictions(pred) for pred in predictions] + + # Ensure the output is a list of lists of Detections + return converted_predictions + +def filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3): + for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): + has_modified = False + for i, detection in enumerate(sample["predictions0"].detections): + + if "original_confidence" not in detection: + detection["original_confidence"] = detection["confidence"] + + if (detection["confidence"] <= confidence_threshold and sample["predictions5"].detections[i]["confidence"] >= confidence_threshold) or \ + (detection["confidence"] >= confidence_threshold and sample["predictions5"].detections[i]["confidence"] <= confidence_threshold): + + sample["predictions0"].detections[i]["confidence"] = sample["predictions5"].detections[i]["confidence"] + has_modified = True + if has_modified: + sample.save() + + +def restore_confidence(predictions_view): + for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): + for i, detection in enumerate(sample["predictions0"].detections): + if "original_confidence" in detection: + detection["confidence"] = detection["original_confidence"] + sample.save() + +def fast_iou(bbox1, bbox2): + x1, y1, w1, h1 = bbox1 + x2, y2, w2, h2 = bbox2 + xA = max(x1, x2) + yA = max(y1, y2) + xB = min(x1 + w1, x2 + w2) + yB = min(y1 + h1, y2 + h2) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = w1 * h1 + boxBArea = w2 * h2 + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + +def assign_iou_diff(predictions_view): + for sample in predictions_view: + ious_0 = [detection.eval0_iou if 'eval0_iou' in detection else None for detection in sample["predictions0"].detections] + ious_5 = [detection.eval5_iou if 'eval5_iou' in detection else None for detection in sample["predictions5"].detections] + bbox_0 = [detection.bounding_box for detection in sample["predictions0"].detections] + bbox_5 = [detection.bounding_box for detection in sample["predictions5"].detections] + # iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None else -1 for iou_0, iou_5 in zip(ious_0, ious_5)] + iou_inter = [fast_iou(b0, b5) for b0, b5 in zip(bbox_0, bbox_5)] + iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None and iou_inter > 0.5 else -1 for iou_0, iou_5, iou_inter in zip(ious_0, ious_5, iou_inter)] + + for detection, iou_diff in zip(sample["predictions0"].detections, iou_diffs): + detection["iou_diff"] = iou_diff + for detection, iou_diff 
in zip(sample["predictions5"].detections, iou_diffs): + detection["iou_diff"] = iou_diff + # for detection, iou_diff in zip(sample["predictions100"].detections, iou_diffs): + # detection["iou_diff"] = iou_diff + sample.save() + +def main(args): + try: + if os.path.exists("saved_predictions_view") and os.path.exists("saved_filtered_view"): + print("Loading saved predictions and filtered views...") + dataset = foz.load_zoo_dataset( + "coco-2017", + split="validation", + dataset_name="evaluate-detections-tutorial", + dataset_dir="data/fiftyone" + ) + + dataset.persistent = True + session = fo.launch_app(dataset, port=args.port) + + predictions_view = fo.Dataset.from_dir( + dataset_dir="saved_predictions_view", + dataset_type=fo.types.FiftyOneDataset + ).view() + filtered_view = fo.Dataset.from_dir( + dataset_dir="saved_filtered_view", + dataset_type=fo.types.FiftyOneDataset + ).view() + else: + dataset = foz.load_zoo_dataset( + "coco-2017", + split="validation", + dataset_name="evaluate-detections-tutorial", + dataset_dir="data/fiftyone" + ) + + dataset.persistent = True + + session = fo.launch_app(dataset, port=args.port) + cfg = YAMLConfig(args.config, resume=args.resume) + if 'HGNetv2' in cfg.yaml_cfg: + cfg.yaml_cfg['HGNetv2']['pretrained'] = False + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('only support resume to load model.state_dict by now.') + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + predictions_view = dataset.take(500, seed=51) + + model = CustomModel(cfg) + L = model.model.decoder.decoder.eval_idx + # Apply models and save predictions in different label fields + for i in [L]: + model.model.decoder.decoder.eval_idx = i + label_field = "predictions{:d}".format(i) + predictions_view.apply_model(model, label_field=label_field) + + # filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3) + for i in [L]: + label_field = "predictions{:d}".format(i) + predictions_view = predictions_view.filter_labels(label_field, F("confidence") > 0.5, only_matches=False) + eval_key = "eval{:d}".format(i) + _ = predictions_view.evaluate_detections( + label_field, + gt_field="ground_truth", + eval_key=eval_key, + compute_mAP=True, + ) + + # assign_iou_diff(predictions_view) + + # filtered_view = predictions_view.filter_labels("predictions0", F("iou_diff") > 0.05, only_matches=True) + # filtered_view = filtered_view.filter_labels("predictions5", F("iou_diff") > 0.05, only_matches=True) + # restore_confidence(filtered_view) + + predictions_view.export( + export_dir="saved_predictions_view", + dataset_type=fo.types.FiftyOneDataset + ) + # filtered_view.export( + # export_dir="saved_filtered_view", + # dataset_type=fo.types.FiftyOneDataset + # ) + + # Display the filtered view + session.view = predictions_view + + # Keep the session open + while True: + time.sleep(1) + except Exception as e: + print(f"An error occurred: {e}") + finally: + print("Shutting down session") + if 'session' in locals(): + session.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', type=str) + parser.add_argument('--resume', '-r', type=str) + parser.add_argument('--port', '-p', type=int) + args = parser.parse_args() + + main(args)