# Detection config: Cascade R-CNN with a Swin-T backbone and FPN neck, trained
# on an 18-class chart-element dataset stored in COCO format.
#
# Component `type` strings (e.g. "CascadeRoIHead_LGF", "BboxEncoder") are
# resolved by name outside this file; their implementations are not visible
# here, so their parameters are passed through unchanged.
#
# Values that used to be copy-pasted (class names, checkpoint URL,
# normalization statistics, training pipeline) are defined once and referenced
# so that they cannot drift apart.

# Category names, shared by the train/val/test datasets. The order matters:
# it must match len(classes) == num_classes in every bbox head below.
classes = [
    "x_title",
    "y_title",
    "plot_area",
    "other",
    "xlabel",
    "ylabel",
    "chart_title",
    "x_tick",
    "y_tick",
    "legend_patch",
    "legend_label",
    "legend_title",
    "legend_area",
    "mark_label",
    "value_label",
    "y_axis_area",
    "x_axis_area",
    "tick_grouping",
]

# Checkpoint used to initialise the backbone.
pretrained = "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth"

model = dict(
    type="CascadeRCNN",
    backbone=dict(
        type="SwinTransformer",
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
        convert_weights=True,
        init_cfg=dict(type="Pretrained", checkpoint=pretrained),
    ),
    neck=dict(
        type="FPN",
        in_channels=[96, 192, 384, 768],
        out_channels=256,
        num_outs=5,
    ),
    rpn_head=dict(
        type="RPNHead",
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type="AnchorGenerator",
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64],
        ),
        bbox_coder=dict(
            type="DeltaXYWHBBoxCoder",
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0],
        ),
        loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type="SmoothL1Loss", beta=0.1111111111111111, loss_weight=1.0),
    ),
    roi_head=dict(
        type="CascadeRoIHead_LGF",
        num_stages=3,
        stage_loss_weights=[1, 1, 0.5],
        bbox_roi_extractor=dict(
            type="SingleRoIExtractor",
            roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32],
        ),
        # One head per cascade stage. The stages are identical except for the
        # regression target stds, which shrink from stage to stage.
        bbox_head=[
            dict(
                type="Shared3FCBBoxHead_with_BboxEncoding",
                in_channels=256,
                fc_out_channels=1024,
                bbox_encoding_dim=512,
                roi_feat_size=7,
                num_classes=len(classes),
                bbox_coder=dict(
                    type="DeltaXYWHBBoxCoder",
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=stage_stds,
                ),
                reg_class_agnostic=True,
                loss_cls=dict(type="FocalLoss"),
                loss_bbox=dict(type="BalancedL1Loss", beta=1.0, loss_weight=1.0),
            )
            for stage_stds in (
                [0.1, 0.1, 0.2, 0.2],
                [0.05, 0.05, 0.1, 0.1],
                [0.033, 0.033, 0.067, 0.067],
            )
        ],
        localglobal_fuser=dict(
            type="LocalGlobal_Context_Fuser",
            channels=256,
            roi_size=7,
            reduced_channels=256,
            lg_merge_layer=dict(type="SELayer", channels=256),
        ),
        lgf_shared=False,
        bbox_encoder=dict(
            type="BboxEncoder",
            n_layer=4,
            n_head=4,
            n_embd=512,
            bbox_cord_dim=4,
            bbox_max_num=1024,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
        ),
        bbox_encoder_shared=False,
    ),
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type="MaxIoUAssigner",
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1,
            ),
            sampler=dict(
                type="RandomSampler",
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False,
            ),
            allowed_border=0,
            pos_weight=-1,
            debug=False,
        ),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type="nms", iou_threshold=0.7),
            min_bbox_size=0,
        ),
        # One entry per cascade stage. Only the IoU threshold changes; it is
        # used for the positive, negative and minimum-positive thresholds.
        rcnn=[
            dict(
                assigner=dict(
                    type="MaxIoUAssigner",
                    pos_iou_thr=stage_iou,
                    neg_iou_thr=stage_iou,
                    min_pos_iou=stage_iou,
                    match_low_quality=False,
                    ignore_iof_thr=-1,
                ),
                sampler=dict(
                    type="RandomSampler",
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True,
                ),
                pos_weight=-1,
                debug=False,
            )
            for stage_iou in (0.5, 0.6, 0.7)
        ],
    ),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type="nms", iou_threshold=0.7),
            min_bbox_size=0,
        ),
        rcnn=dict(
            score_thr=0.0,
            nms=dict(type="nms", iou_threshold=0.7),
            max_per_img=200,
        ),
    ),
)

dataset_type = "CocoDataset"
data_root = "data/coco/"

# Per-channel normalization statistics. Every pipeline in this file must use
# these same values so that evaluation inputs match the training inputs.
img_norm_cfg = dict(
    mean=[216.45, 212.36, 206.76], std=[55.82, 56.04, 55.56], to_rgb=True
)

train_pipeline = [
    dict(type="LoadImageFromFile"),
    dict(type="LoadAnnotations", with_bbox=True),
    dict(
        type="AutoAugment",
        policies=[
            # Policy 1: multi-scale resize only.
            [
                dict(
                    type="Resize",
                    # (480, 1333), (512, 1333), ..., (800, 1333)
                    img_scale=[(side, 1333) for side in range(480, 801, 32)],
                    multiscale_mode="value",
                    keep_ratio=True,
                )
            ],
            # Policy 2: resize, crop, resize again, then photometric and
            # occlusion augmentations.
            [
                dict(
                    type="Resize",
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode="value",
                    keep_ratio=True,
                ),
                dict(
                    type="RandomCrop",
                    crop_type="absolute_range",
                    crop_size=(384, 600),
                    allow_negative_crop=True,
                ),
                dict(
                    type="Resize",
                    img_scale=[(side, 1333) for side in range(480, 801, 32)],
                    multiscale_mode="value",
                    override=True,
                    keep_ratio=True,
                ),
                dict(
                    type="PhotoMetricDistortion",
                    brightness_delta=32,
                    contrast_range=(0.5, 1.5),
                    saturation_range=(0.5, 1.5),
                    hue_delta=18,
                ),
                dict(
                    type="MinIoURandomCrop",
                    min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
                    min_crop_size=0.3,
                ),
                dict(
                    type="CutOut",
                    n_holes=(5, 10),
                    cutout_shape=[
                        (4, 4),
                        (4, 8),
                        (8, 4),
                        (8, 8),
                        (16, 32),
                        (32, 16),
                        (32, 32),
                        (32, 48),
                        (48, 32),
                        (48, 48),
                    ],
                ),
            ],
        ],
    ),
    dict(type="RandomFlip", flip_ratio=0.1),
    dict(type="Normalize", **img_norm_cfg),
    dict(type="Pad", size_divisor=32),
    dict(type="DefaultFormatBundle"),
    dict(type="Collect", keys=["img", "gt_bboxes", "gt_labels"]),
]

# NOTE(review): this pipeline is defined but not referenced by `data` below,
# which uses `eval_pipeline`. It is kept unchanged for anything that reads it
# by name.
test_pipeline = [
    dict(type="LoadImageFromFile", to_float32=True),
    dict(
        type="MultiScaleFlipAug",
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type="Resize", keep_ratio=True),
            dict(type="RandomFlip", flip_ratio=0.0),
            dict(type="Normalize", **img_norm_cfg),
            dict(type="Pad", size_divisor=32),
            dict(type="DefaultFormatBundle"),
            dict(type="Collect", keys=["img"]),
        ],
    ),
]

# Pipeline used by the val and test datasets. It previously normalized with
# mean=[123.675, 116.28, 103.53] / std=[58.395, 57.12, 57.375], which did not
# match the statistics the model is trained with; it now uses img_norm_cfg.
eval_pipeline = [
    dict(type="LoadImageFromFile"),
    dict(
        type="MultiScaleFlipAug",
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type="Resize", keep_ratio=True),
            dict(type="RandomFlip"),
            dict(type="Normalize", **img_norm_cfg),
            dict(type="Pad", size_divisor=32),
            dict(type="ImageToTensor", keys=["img"]),
            dict(type="Collect", keys=["img"]),
        ],
    ),
]

data = dict(
    samples_per_gpu=3,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file="./data/pmc_2022/pmc_coco/element_detection/train.json",
        img_prefix="./data/pmc_2022/pmc_coco/element_detection/train/",
        pipeline=train_pipeline,
        classes=classes,
    ),
    val=dict(
        type=dataset_type,
        ann_file="./data/pmc_2022/pmc_coco/element_detection/val.json",
        img_prefix="./data/pmc_2022/pmc_coco/element_detection/val/",
        pipeline=eval_pipeline,
        classes=classes,
    ),
    test=dict(
        type=dataset_type,
        ann_file="./data/pmc_2022/pmc_coco/element_detection/split3_test.json",
        img_prefix="./data/pmc_2022/pmc_coco/element_detection/split3_test/",
        pipeline=eval_pipeline,
        classes=classes,
    ),
)
evaluation = dict(interval=1, metric=["bbox"])

optimizer = dict(
    type="AdamW",
    lr=0.0002,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    paramwise_cfg=dict(
        # No weight decay for parameters whose names match these keys.
        custom_keys=dict(
            absolute_pos_embed=dict(decay_mult=0.0),
            relative_position_bias_table=dict(decay_mult=0.0),
            norm=dict(decay_mult=0.0),
        )
    ),
)
optimizer_config = dict(grad_clip=None)
# NOTE(review): step=[8, 11] looks like a 12-epoch schedule, but max_epochs is
# 150 below, so both decays happen very early in training - confirm intended.
lr_config = dict(
    policy="step",
    warmup="linear",
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11],
)
runner = dict(type="EpochBasedRunner", max_epochs=150)

checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type="TextLoggerHook")])
custom_hooks = [dict(type="NumClassCheckHook")]
dist_params = dict(backend="nccl")
log_level = "INFO"
load_from = None
resume_from = None
workflow = [("train", 1)]
opencv_num_threads = 0
mp_start_method = "fork"
auto_scale_lr = dict(enable=False, base_batch_size=16)
auto_resume = False
gpu_ids = range(0, 4)