hanszhu
/

ChartPointNet-InstanceSeg

Image Segmentation

English

Model card Files Files and versions

xet

Community

hanszhu commited on Jul 30, 2025

Commit

c26f22f

verified ·

1 Parent(s): c598cec

Upload chart_pointnet_swin.py

Browse files

Files changed (1) hide show

chart_pointnet_swin.py +374 -0

chart_pointnet_swin.py ADDED Viewed

	@@ -0,0 +1,374 @@

+# mask_rcnn_swin_meta.py - Mask R-CNN with Swin Transformer for data point segmentation
+#
+# ADAPTED FROM CASCADE R-CNN CONFIG:
+# - Uses same Swin Transformer Base backbone with optimizations
+# - Maintains data-point class weighting (10x) and IoU strategies
+# - Adds mask head for instance segmentation of data points
+# - Uses enhanced annotation files with segmentation masks
+# - Keeps custom hooks and progressive loss strategies
+#
+# MASK-SPECIFIC OPTIMIZATIONS:
+# - RoI size 14x14 for mask extraction (matches data point size)
+# - FCN mask head with 4 convolution layers
+# - Mask loss weight balanced with bbox and classification losses
+# - Enhanced test-time augmentation for better mask quality
+#
+# DATA POINT FOCUS:
+# - Primary target: data-point class (ID 11) with 10x weight
+# - Generates both bounding boxes AND instance masks
+# - Optimized for 16x16 pixel data points in scientific charts
+# Removed _base_ inheritance to avoid path issues - all configs are inlined below
+# Custom imports - same as Cascade R-CNN setup
+custom_imports = dict(
+    imports=[
+        'legend_match_swin.custom_models.register',
+        'legend_match_swin.custom_models.custom_hooks',
+        'legend_match_swin.custom_models.progressive_loss_hook',
+        'legend_match_swin.custom_models.flexible_load_annotations',
+    ],
+    allow_failed_imports=False
+)
+# Put the current working directory at the front of the module search path.
+# Presumably this lets the 'legend_match_swin' package listed in
+# custom_imports resolve when training is launched from the project root -
+# TODO confirm the expected launch directory.
+import sys
+sys.path.insert(0, '.')
+# Mask R-CNN model with Swin Transformer backbone
+model = dict(
+    type='MaskRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        pad_mask=True,  # Important for mask training
+        mask_pad_value=0,
+    ),
+    # Same Swin Transformer Base backbone as Cascade R-CNN
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=128,  # Swin Base embedding dimensions
+        depths=[2, 2, 18, 2],  # Swin Base depths
+        num_heads=[4, 8, 16, 32],  # Swin Base attention heads
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.3,  # Same as Cascade config
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_22k_20220317-4f79f7c0.pth'
+        )
+    ),
+    # Same FPN as Cascade R-CNN
+    neck=dict(
+        type='FPN',
+        in_channels=[128, 256, 512, 1024],  # Swin Base: embed_dims * 2^(stage)
+        out_channels=256,
+        num_outs=5,  # Standard for Mask R-CNN (was 6 in Cascade)
+        start_level=0,
+        add_extra_convs='on_input'
+    ),
+    # Same RPN configuration as Cascade R-CNN
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[1, 2, 4, 8],  # Same small scales for tiny objects
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),  # Standard FPN strides for Mask R-CNN
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)
+    ),
+    # Mask R-CNN ROI head with bbox + mask branches
+    roi_head=dict(
+        type='StandardRoIHead',
+        # Bbox ROI extractor (same as Cascade R-CNN final stage)
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]
+        ),
+        # Bbox head with data-point class weighting
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=22,  # 22 enhanced categories including boxplot
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]
+            ),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0,
+                class_weight=[1.0,  # background class (index 0)
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                             10.0,  # data-point at index 12 gets 10x weight (11+1 for background)
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # Added boxplot class
+            ),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)
+        ),
+        # Mask ROI extractor (optimized for 16x16 data points)
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=(14, 14), sampling_ratio=0, aligned=True),  # Force exact 14x14 with legacy alignment
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]
+        ),
+        # Mask head optimized for data points with square mask targets
+        mask_head=dict(
+            type='SquareFCNMaskHead',
+            num_convs=4,  # 4 conv layers for good feature extraction
+            in_channels=256,
+            roi_feat_size=14,  # Explicitly set ROI feature size
+            conv_out_channels=256,
+            num_classes=22,  # 22 enhanced categories including boxplot
+            upsample_cfg=dict(type=None),  # No upsampling - keep 14x14
+            loss_mask=dict(
+                type='CrossEntropyLoss',
+                use_mask=True,
+                loss_weight=1.0  # Balanced with bbox loss
+            )
+        )
+    ),
+    # Training configuration adapted from Cascade R-CNN
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        # RCNN training (using Cascade stage 2 settings - balanced for mask training)
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,  # Balanced IoU for bbox + mask training
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,  # Important for small data points
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=(14, 14),  # Force exact 14x14 size for data points
+            pos_weight=-1,
+            debug=False)
+    ),
+    # Test configuration with soft NMS
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.005,  # Low threshold to catch data points
+            nms=dict(
+                type='soft_nms',  # Soft NMS for better small object detection
+                iou_threshold=0.3,  # Low for data points
+                min_score=0.005,
+                method='gaussian',
+                sigma=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5  # Binary mask threshold
+        )
+    )
+)
+# Dataset settings - using standard COCO dataset for mask support
+dataset_type = 'CocoDataset'
+data_root = ''
+# 22 enhanced categories including boxplot
+CLASSES = (
+    'title', 'subtitle', 'x-axis', 'y-axis', 'x-axis-label', 'y-axis-label',        # 0-5
+    'x-tick-label', 'y-tick-label', 'legend', 'legend-title', 'legend-item',        # 6-10
+    'data-point', 'data-line', 'data-bar', 'data-area', 'grid-line',              # 11-15 (data-point at index 11)
+    'axis-title', 'tick-label', 'data-label', 'legend-text', 'plot-area',         # 16-20
+    'boxplot'                                                                      # 21
+)
+# Verify data-point class index
+assert CLASSES[11] == 'data-point', f"Expected 'data-point' at index 11 in CLASSES tuple, got '{CLASSES[11]}'"
+# Training dataloader with mask annotations
+train_dataloader = dict(
+    batch_size=2,  # Same as Cascade R-CNN
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='legend_match_swin/mask_generation/enhanced_datasets/train_filtered_with_masks_only.json',
+        data_prefix=dict(img='legend_data/train/images/'),
+        metainfo=dict(classes=CLASSES),
+        filter_cfg=dict(filter_empty_gt=False, min_size=12),  # Don't filter out images with masks
+        # Disable any built-in filtering that might remove annotations
+        test_mode=False,
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='FlexibleLoadAnnotations', with_bbox=True, with_mask=True),
+            dict(type='Resize', scale=(1120, 672), keep_ratio=True),
+            dict(type='RandomFlip', prob=0.5),
+            dict(type='ClampBBoxes'),
+            dict(type='PackDetInputs')
+        ]
+    )
+)
+# Validation dataloader with mask annotations
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='legend_match_swin/mask_generation/enhanced_datasets/val_enriched_with_masks_only.json',
+        data_prefix=dict(img='legend_data/train/images/'),
+        metainfo=dict(classes=CLASSES),
+        test_mode=True,
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='Resize', scale=(1120, 672), keep_ratio=True),
+            dict(type='FlexibleLoadAnnotations', with_bbox=True, with_mask=True),
+            dict(type='ClampBBoxes'),
+            dict(type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor'))
+        ]
+    )
+)
+test_dataloader = val_dataloader
+# Enhanced evaluators for both bbox and mask metrics
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file='legend_match_swin/mask_generation/enhanced_datasets/val_enriched_with_masks_only.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    classwise=True,
+    proposal_nums=(100, 300, 1000)
+)
+test_evaluator = val_evaluator
+# Same custom hooks as Cascade R-CNN
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CompatibleCheckpointHook', interval=1, save_best='auto', max_keep_ckpts=3),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='DetVisualizationHook')
+)
+# Same custom hooks as Cascade R-CNN (adapted for Mask R-CNN)
+custom_hooks = [
+    dict(type='SkipBadSamplesHook', interval=1),
+    dict(type='ChartTypeDistributionHook', interval=500),
+    dict(type='MissingImageReportHook', interval=1000),
+    dict(type='NanRecoveryHook',
+         fallback_loss=1.0,
+         max_consecutive_nans=50,
+         log_interval=25),
+    # Note: Progressive loss hook not used in standard Mask R-CNN
+    # but could be adapted if needed for bbox loss only
+]
+# Training configuration - reduced to 20 epochs
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+# Same optimizer settings as Cascade R-CNN
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=10.0, norm_type=2)
+)
+# Same learning rate schedule as Cascade R-CNN
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.1,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        end=20,
+        by_epoch=True,
+        T_max=20,
+        eta_min=1e-5,
+        convert_to_iter_based=True)
+]
+# Work directory
+work_dir = '/content/drive/MyDrive/Research Summer 2025/Dense Captioning Toolkit/CHART-DeMatch/work_dirs/mask_rcnn_swin_base_20ep_meta'
+# Fresh start
+resume = False
+load_from = None
+# Default runtime settings (normally inherited from _base_)
+default_scope = 'mmdet'
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+log_level = 'INFO'