stpn

Browse files

Files changed (5) hide show

work_dirs/stpn_swint_adam_9x/20240204_030125.log +0 -0
work_dirs/stpn_swint_adam_9x/20240204_030125.log.json +0 -0
work_dirs/stpn_swint_adam_9x/epoch_9_model.pth +3 -0
work_dirs/stpn_swint_adam_9x/eval.txt +3 -0
work_dirs/stpn_swint_adam_9x/stpn_swint_adam_9x.py +438 -0

work_dirs/stpn_swint_adam_9x/20240204_030125.log ADDED Viewed

The diff for this file is too large to render. See raw diff

work_dirs/stpn_swint_adam_9x/20240204_030125.log.json ADDED Viewed

The diff for this file is too large to render. See raw diff

work_dirs/stpn_swint_adam_9x/epoch_9_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bcb39c4070df69ae917cd237f36cc9c77eec63a53ce94ae9b7a931aefdd27b7
+size 180353653

work_dirs/stpn_swint_adam_9x/eval.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+{'all': 0.8515168953546883, 'fast': 0.6405709450111929, 'medium': 0.8412701278128932, 'slow': 0.9141449563100874, 'airplane': 0.9586996519276003, 'antelope': 0.8799440834409841, 'bear': 0.8989029949927739, 'bicycle': 0.8851157502725769, 'bird': 0.7930013993678566, 'bus': 0.841979109569196, 'car': 0.7758164365133777, 'cattle': 0.802800559124309, 'dog': 0.8453745668140737, 'domestic_cat': 0.9140264245981315, 'elephant': 0.8546385510194372, 'fox':
+0.947818798999815, 'giant_panda': 0.8667739758302728, 'hamster': 0.9850564156153161, 'horse': 0.8874280101304849, 'lion': 0.7234216680206619, 'lizard': 0.8713093258061801, 'monkey': 0.6710894126913831, 'motorcycle': 0.9198253019671686, 'rabbit': 0.7994001086999526, 'red_panda': 0.8903292259476213, 'sheep': 0.7809233256814476, 'snake': 0.8029446576625736, 'squirrel': 0.6965247167195919, 'tiger': 0.9354936714466412, 'train': 0.8845634667175416, 'turtle': 0.81486794558347,
+'watercraft': 0.8363646138283387, 'whale': 0.8189654860172317, 'zebra': 0.9621072056346469}

work_dirs/stpn_swint_adam_9x/stpn_swint_adam_9x.py ADDED Viewed

	@@ -0,0 +1,438 @@

+checkpoint_config = dict(interval=9)
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+custom_hooks = [dict(type='NumClassCheckHook')]
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+optimizer = dict(
+    type='AdamW',
+    lr=2.5e-05,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg=dict(
+        custom_keys=dict(
+            absolute_pos_embed=dict(decay_mult=0.0),
+            relative_position_bias_table=dict(decay_mult=0.0),
+            norm=dict(decay_mult=0.0))))
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.3333333333333333,
+    step=[6])
+runner = dict(type='EpochBasedRunner', max_epochs=9)
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.2/mask_rcnn_swin_tiny_patch4_window7.pth'
+is_video_model = True
+model = dict(
+    type='STPN',
+    detector=dict(
+        type='FasterRCNN',
+        backbone=dict(
+            type='STPNSwinTransformer',
+            embed_dims=96,
+            depths=[2, 2, 6, 2],
+            num_heads=[3, 6, 12, 24],
+            window_size=7,
+            mlp_ratio=4,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.0,
+            attn_drop_rate=0.0,
+            drop_path_rate=0.2,
+            patch_norm=True,
+            with_cp=False,
+            convert_weights=True,
+            init_cfg=dict(
+                type='Pretrained',
+                checkpoint=
+                'https://github.com/SwinTransformer/storage/releases/download/v1.0.2/mask_rcnn_swin_tiny_patch4_window7.pth'
+            ),
+            prompt_cfg=dict(
+                num_tokens=5,
+                location='prepend',
+                deep=False,
+                dropout=0.0,
+                initiation='random')),
+        neck=dict(
+            type='FPN',
+            in_channels=[96, 192, 384, 768],
+            out_channels=256,
+            num_outs=5),
+        rpn_head=dict(
+            type='RPNHead',
+            in_channels=256,
+            feat_channels=256,
+            anchor_generator=dict(
+                type='AnchorGenerator',
+                scales=[8],
+                ratios=[0.5, 1.0, 2.0],
+                strides=[4, 8, 16, 32, 64]),
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0.0, 0.0, 0.0, 0.0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox=dict(
+                type='SmoothL1Loss', beta=0.1111111111111111,
+                loss_weight=1.0)),
+        roi_head=dict(
+            type='StandardRoIHead',
+            bbox_roi_extractor=dict(
+                type='SingleRoIExtractor',
+                roi_layer=dict(
+                    type='RoIAlign', output_size=7, sampling_ratio=0),
+                out_channels=256,
+                featmap_strides=[4, 8, 16, 32]),
+            bbox_head=dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=30,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0.0, 0.0, 0.0, 0.0],
+                    target_stds=[0.2, 0.2, 0.2, 0.2]),
+                reg_class_agnostic=False,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(
+                    type='SmoothL1Loss',
+                    beta=0.1111111111111111,
+                    loss_weight=1.0))),
+        train_cfg=dict(
+            rpn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    match_low_quality=True,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.5,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=False),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False),
+            rpn_proposal=dict(
+                nms_pre=1000,
+                max_per_img=300,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=True,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)),
+        test_cfg=dict(
+            rpn=dict(
+                nms_pre=1000,
+                max_per_img=300,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                score_thr=0.0001,
+                nms=dict(type='nms', iou_threshold=0.5),
+                max_per_img=100,
+                mask_thr_binary=0.5))))
+dataset_type = 'ImagenetVIDDataset'
+data_root = 'data/ILSVRC/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadMultiImagesFromFile'),
+    dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=False),
+    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[{
+            'type':
+            'SeqResize',
+            'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                          (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                          (736, 1333), (768, 1333), (800, 1333)],
+            'multiscale_mode':
+            'value',
+            'keep_ratio':
+            True
+        }],
+                  [{
+                      'type': 'SeqResize',
+                      'img_scale': [(400, 1333), (500, 1333), (600, 1333)],
+                      'multiscale_mode': 'value',
+                      'keep_ratio': True
+                  }, {
+                      'type': 'SeqRandomCrop',
+                      'crop_type': 'absolute_range',
+                      'crop_size': (384, 600),
+                      'allow_negative_crop': True
+                  }, {
+                      'type': 'SeqMaxSizePad'
+                  }, {
+                      'type':
+                      'SeqResize2',
+                      'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                      'multiscale_mode':
+                      'value',
+                      'keep_ratio':
+                      True
+                  }]]),
+    dict(
+        type='SeqNormalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True),
+    dict(type='SeqPad', size_divisor=16),
+    dict(type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels']),
+    dict(type='ConcatVideoReferences'),
+    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+]
+test_pipeline = [
+    dict(type='LoadMultiImagesFromFile'),
+    dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+    dict(
+        type='SeqNormalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True),
+    dict(type='SeqPad', size_divisor=16),
+    dict(
+        type='VideoCollect',
+        keys=['img'],
+        meta_keys=('num_left_ref_imgs', 'frame_stride')),
+    dict(type='ConcatVideoReferences'),
+    dict(type='MultiImagesToTensor', ref_prefix='ref'),
+    dict(type='ToList')
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=4,
+    train=[
+        dict(
+            type='ImagenetVIDDataset',
+            ann_file='data/ILSVRC/annotations/imagenet_vid_train.json',
+            img_prefix='data/ILSVRC/Data/VID',
+            ref_img_sampler=dict(
+                num_ref_imgs=2,
+                frame_range=9,
+                filter_key_img=True,
+                method='bilateral_uniform'),
+            pipeline=[
+                dict(type='LoadMultiImagesFromFile'),
+                dict(
+                    type='SeqLoadAnnotations', with_bbox=True,
+                    with_mask=False),
+                dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+                dict(
+                    type='AutoAugment',
+                    policies=[[{
+                        'type':
+                        'SeqResize',
+                        'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+                                      (576, 1333), (608, 1333), (640, 1333),
+                                      (672, 1333), (704, 1333), (736, 1333),
+                                      (768, 1333), (800, 1333)],
+                        'multiscale_mode':
+                        'value',
+                        'keep_ratio':
+                        True
+                    }],
+                              [{
+                                  'type':
+                                  'SeqResize',
+                                  'img_scale': [(400, 1333), (500, 1333),
+                                                (600, 1333)],
+                                  'multiscale_mode':
+                                  'value',
+                                  'keep_ratio':
+                                  True
+                              }, {
+                                  'type': 'SeqRandomCrop',
+                                  'crop_type': 'absolute_range',
+                                  'crop_size': (384, 600),
+                                  'allow_negative_crop': True
+                              }, {
+                                  'type': 'SeqMaxSizePad'
+                              }, {
+                                  'type':
+                                  'SeqResize2',
+                                  'img_scale': [(480, 1333), (512, 1333),
+                                                (544, 1333), (576, 1333),
+                                                (608, 1333), (640, 1333),
+                                                (672, 1333), (704, 1333),
+                                                (736, 1333), (768, 1333),
+                                                (800, 1333)],
+                                  'multiscale_mode':
+                                  'value',
+                                  'keep_ratio':
+                                  True
+                              }]]),
+                dict(
+                    type='SeqNormalize',
+                    mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True),
+                dict(type='SeqPad', size_divisor=16),
+                dict(
+                    type='VideoCollect',
+                    keys=['img', 'gt_bboxes', 'gt_labels']),
+                dict(type='ConcatVideoReferences'),
+                dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+            ]),
+        dict(
+            type='ImagenetVIDDataset',
+            load_as_video=False,
+            ann_file='data/ILSVRC/annotations/imagenet_det_30plus1cls.json',
+            img_prefix='data/ILSVRC/Data/DET',
+            ref_img_sampler=dict(
+                num_ref_imgs=2,
+                frame_range=0,
+                filter_key_img=False,
+                method='bilateral_uniform'),
+            pipeline=[
+                dict(type='LoadMultiImagesFromFile'),
+                dict(
+                    type='SeqLoadAnnotations', with_bbox=True,
+                    with_mask=False),
+                dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+                dict(
+                    type='AutoAugment',
+                    policies=[[{
+                        'type':
+                        'SeqResize',
+                        'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+                                      (576, 1333), (608, 1333), (640, 1333),
+                                      (672, 1333), (704, 1333), (736, 1333),
+                                      (768, 1333), (800, 1333)],
+                        'multiscale_mode':
+                        'value',
+                        'keep_ratio':
+                        True
+                    }],
+                              [{
+                                  'type':
+                                  'SeqResize',
+                                  'img_scale': [(400, 1333), (500, 1333),
+                                                (600, 1333)],
+                                  'multiscale_mode':
+                                  'value',
+                                  'keep_ratio':
+                                  True
+                              }, {
+                                  'type': 'SeqRandomCrop',
+                                  'crop_type': 'absolute_range',
+                                  'crop_size': (384, 600),
+                                  'allow_negative_crop': True
+                              }, {
+                                  'type': 'SeqMaxSizePad'
+                              }, {
+                                  'type':
+                                  'SeqResize2',
+                                  'img_scale': [(480, 1333), (512, 1333),
+                                                (544, 1333), (576, 1333),
+                                                (608, 1333), (640, 1333),
+                                                (672, 1333), (704, 1333),
+                                                (736, 1333), (768, 1333),
+                                                (800, 1333)],
+                                  'multiscale_mode':
+                                  'value',
+                                  'keep_ratio':
+                                  True
+                              }]]),
+                dict(
+                    type='SeqNormalize',
+                    mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True),
+                dict(type='SeqPad', size_divisor=16),
+                dict(
+                    type='VideoCollect',
+                    keys=['img', 'gt_bboxes', 'gt_labels']),
+                dict(type='ConcatVideoReferences'),
+                dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+            ])
+    ],
+    val=dict(
+        type='ImagenetVIDDataset',
+        ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
+        img_prefix='data/ILSVRC/Data/VID',
+        ref_img_sampler=dict(
+            num_ref_imgs=14,
+            frame_range=[-7, 7],
+            method='test_with_adaptive_stride'),
+        pipeline=[
+            dict(type='LoadMultiImagesFromFile'),
+            dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+            dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+            dict(
+                type='SeqNormalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(type='SeqPad', size_divisor=16),
+            dict(
+                type='VideoCollect',
+                keys=['img'],
+                meta_keys=('num_left_ref_imgs', 'frame_stride')),
+            dict(type='ConcatVideoReferences'),
+            dict(type='MultiImagesToTensor', ref_prefix='ref'),
+            dict(type='ToList')
+        ],
+        test_mode=True),
+    test=dict(
+        type='ImagenetVIDDataset',
+        ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
+        img_prefix='data/ILSVRC/Data/VID',
+        ref_img_sampler=dict(
+            num_ref_imgs=14,
+            frame_range=[-7, 7],
+            method='test_with_adaptive_stride'),
+        pipeline=[
+            dict(type='LoadMultiImagesFromFile'),
+            dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+            dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+            dict(
+                type='SeqNormalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(type='SeqPad', size_divisor=16),
+            dict(
+                type='VideoCollect',
+                keys=['img'],
+                meta_keys=('num_left_ref_imgs', 'frame_stride')),
+            dict(type='ConcatVideoReferences'),
+            dict(type='MultiImagesToTensor', ref_prefix='ref'),
+            dict(type='ToList')
+        ],
+        test_mode=True))
+total_epochs = 9
+evaluation = dict(metric=['bbox'], vid_style=True, interval=9)
+work_dir = './work_dirs/stpn_swint_adam_9x'
+gpu_ids = range(0, 8)