# MMEngine / MMDetection config: Faster R-CNN with a RoPE-ViT backbone and a
# SimpleViTFPN neck, trained on COCO bbox detection at 512x512 for 12 epochs.
#
# The custom module types used below (RoPEViTBackbone, SimpleViTFPN, LN2d) are
# presumably registered by the project-local `detection` package that
# `custom_imports` loads -- confirm against that package.
#
# Shared values are defined once and referenced everywhere they apply, so a
# change to e.g. `img_size` or `hidden_dim` cannot drift out of sync.

# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
default_scope = 'mmdet'
custom_imports = dict(allow_failed_imports=False, imports=['detection'])
launcher = 'none'
load_from = None
resume = False
log_level = 'INFO'
log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
work_dir = './work_dirs/faster_rcnn_rope_vit_tiny_coco'
backend_args = None
env_cfg = dict(
    cudnn_benchmark=True,
    dist_cfg=dict(backend='nccl'),
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    timer=dict(type='IterTimerHook'),
    visualization=dict(type='DetVisualizationHook'))

# ---------------------------------------------------------------------------
# Shared hyper-parameters
# ---------------------------------------------------------------------------
img_size = 512
crop_size = (img_size, img_size)  # used as Resize scale and Pad size
patch_size = 16
depth = 12
hidden_dim = 192
mlp_dim = 768
num_heads = 3
num_classes = 80

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = dict(
    type='FasterRCNN',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32),
    backbone=dict(
        type='RoPEViTBackbone',
        img_size=img_size,
        # Differs from img_size: the checkpoint below was presumably
        # pre-trained at 224 -- confirm against the backbone implementation.
        pretrain_img_size=224,
        patch_size=patch_size,
        in_chans=3,
        hidden_dim=hidden_dim,
        depth=depth,
        num_heads=num_heads,
        mlp_dim=mlp_dim,
        dropout=0.0,
        attention_dropout=0.0,
        rope_theta=10.0,
        out_indices=(2, 5, 8, 11),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='checkpoints/rope_vit_imagenet100_best.pth')),
    neck=dict(
        type='SimpleViTFPN',
        backbone_channel=hidden_dim,
        out_channels=256,
        num_outs=5,
        norm_cfg=dict(type='LN2d', requires_grad=True)),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        num_convs=2,
        # Five strides, matching the neck's num_outs=5.
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        # Only the first four strides are listed for RoI extraction.
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared4Conv1FCBBoxHead',
            in_channels=256,
            conv_out_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=num_classes,
            norm_cfg=dict(type='LN2d', requires_grad=True),
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
dataset_type = 'CocoDataset'
data_root = 'data/coco/'

train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=backend_args),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', scale=crop_size, keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    # Aspect-preserving resize, then pad to the fixed crop_size with value 114.
    dict(type='Pad', size=crop_size, pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs'),
]
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=backend_args),
    dict(type='Resize', scale=crop_size, keep_ratio=True),
    dict(type='Pad', size=crop_size, pad_val=dict(img=(114, 114, 114))),
    # Annotations are loaded after Resize/Pad in the eval pipeline.
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=(
            'img_id',
            'img_path',
            'ori_shape',
            'img_shape',
            'scale_factor',
        )),
]

train_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    batch_sampler=dict(type='AspectRatioBatchSampler'),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=True, min_size=32),
        pipeline=train_pipeline,
        backend_args=backend_args))
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=backend_args))
# Testing uses exactly the validation split and settings.
test_dataloader = val_dataloader

val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'annotations/instances_val2017.json',
    metric='bbox',
    format_only=False,
    backend_args=backend_args)
test_evaluator = val_evaluator

# ---------------------------------------------------------------------------
# Optimisation and schedule
# ---------------------------------------------------------------------------
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

optim_wrapper = dict(
    type='AmpOptimWrapper',
    optimizer=dict(
        type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05),
    # Parameters matched by the `backbone` key get a 0.1 LR multiplier.
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.1))),
    clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
    # Iteration-based linear warm-up over the first 500 iterations.
    dict(
        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
        end=500),
    # Epoch-based step decay: LR x0.1 at epochs 8 and 11.
    dict(
        type='MultiStepLR',
        by_epoch=True,
        begin=0,
        end=12,
        milestones=[8, 11],
        gamma=0.1),
]
# base_batch_size equals train_dataloader.batch_size (16).
auto_scale_lr = dict(base_batch_size=16, enable=True)

# ---------------------------------------------------------------------------
# Visualisation
# ---------------------------------------------------------------------------
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(
        type='WandbVisBackend',
        init_kwargs=dict(
            project='vit-detection',
            name='faster_rcnn_rope_vit_tiny_coco_512',
            tags=['rope_vit', 'coco', 'faster_rcnn', 'extrapolation'])),
]
visualizer = dict(
    type='DetLocalVisualizer', name='visualizer', vis_backends=vis_backends)
|
|