Dmmm997 committed on Sep 5, 2025

Commit

a482a69

verified ·

1 Parent(s): 8c86618

Upload 21 files

Browse files

Files changed (21) hide show

grefcoco/PropVG-grefcoco.pth +3 -0
grefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx +0 -0
grefcoco/test_log.txt +331 -0
refcoco+/PropVG-refcoco+.pth +3 -0
refcoco+/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx +0 -0
refcoco+/test_log.txt +335 -0
refcoco-mix/PropVG-refcoco-mix.pth +3 -0
refcoco-mix/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx +0 -0
refcoco-mix/test_log.txt +540 -0
refcoco/PropVG-refcoco.pth +3 -0
refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx +0 -0
refcoco/test_log.txt +335 -0
refcocog/PropVG-refcocog.pth +3 -0
refcocog/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx +0 -0
refcocog/test_log.txt +294 -0
refzom/PropVG-refzom.pth +3 -0
refzom/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx +0 -0
refzom/test_log.txt +240 -0
rrefcoco/PropVG-rrefcoco.pth +3 -0
rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx +0 -0
rrefcoco/test_log.txt +314 -0

grefcoco/PropVG-grefcoco.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:410cf0998478247598391597dc0da8f287079ade292257380e352d2dc4b64084
+size 987093029

grefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx ADDED Viewed

Binary file (5.16 kB). View file

grefcoco/test_log.txt ADDED Viewed

	@@ -0,0 +1,331 @@

+2025-07-07 10:57:14,028 - PropVG - INFO - dataset = 'GRefCOCO'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='GRefCOCO',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox', 'gt_mask_parts_rle'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index', 'tokenized_words'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='GRefCOCO',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox', 'gt_mask_parts_rle'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index', 'tokenized_words'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='GRefCOCO',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox', 'gt_mask_parts_rle'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index', 'tokenized_words'
+        ])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        type='GRefCOCO',
+        which_set='train',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/grefs/instances.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='GRefCOCO',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/grefs/coco_annotations.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val=dict(
+        type='GRefCOCO',
+        which_set='val',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/grefs/instances.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='GRefCOCO',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/grefs/coco_annotations.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testA=dict(
+        type='GRefCOCO',
+        which_set='testA',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/grefs/instances.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='GRefCOCO',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/grefs/coco_annotations.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testB=dict(
+        type='GRefCOCO',
+        which_set='testB',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/grefs/instances.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='GRefCOCO',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/grefs/coco_annotations.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/gres/PropVG-grefcoco.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 7
+max_token = 50
+img_size = 320
+patch_size = 16
+model = dict(
+    type='MIXGrefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=320,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='GTMHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=250)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=False,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[7, 11],
+    decay_ratio=0.1,
+    max_epoch=12)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 4
+2025-07-07 10:57:25,861 - PropVG - INFO - GRefCOCO-val size: 16870
+2025-07-07 10:57:37,626 - PropVG - INFO - GRefCOCO-testA size: 18712
+2025-07-07 10:57:49,703 - PropVG - INFO - GRefCOCO-testB size: 14933
+2025-07-07 10:57:55,300 - PropVG - INFO - loaded checkpoint from work_dir/gres/PropVG-grefcoco.pth
+2025-07-07 10:57:55,323 - PropVG - INFO - PropVG - evaluating set val
+2025-07-07 10:59:51,470 - PropVG - INFO - ------------ validate ------------  time: 116.14, F1score: 72.16, Nacc: 72.83, Tacc: 96.93, gIoU: 73.29, cIoU: 69.23, MaskACC@0.7-0.9: [74.74, 60.99, 23.42]
+2025-07-07 10:59:52,918 - PropVG - INFO - PropVG - evaluating set testA
+2025-07-07 11:01:57,887 - PropVG - INFO - ------------ validate ------------  time: 124.96, F1score: 68.77, Nacc: 69.87, Tacc: 96.56, gIoU: 74.43, cIoU: 74.20, MaskACC@0.7-0.9: [77.48, 65.93, 30.06]
+2025-07-07 11:01:59,563 - PropVG - INFO - PropVG - evaluating set testB
+2025-07-07 11:03:41,160 - PropVG - INFO - ------------ validate ------------  time: 101.59, F1score: 59.02, Nacc: 64.97, Tacc: 91.68, gIoU: 65.87, cIoU: 64.76, MaskACC@0.7-0.9: [62.03, 51.61, 28.43]
+2025-07-07 11:03:42,844 - PropVG - INFO - successfully save the results to work_dir/gres/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx !!!

refcoco+/PropVG-refcoco+.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15fdea912cd5ac8722ff2a12954cc621ff21bea2e19bae900191f63419ea335e
+size 987633701

refcoco+/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED Viewed

Binary file (5.2 kB). View file

refcoco+/test_log.txt ADDED Viewed

	@@ -0,0 +1,335 @@

+2025-07-07 11:09:02,802 - PropVG - INFO - dataset = 'RefCOCOPlusUNC'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOPlusUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOPlusUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOPlusUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='RefCOCOPlusUNC',
+        which_set='train',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOPlusUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val=dict(
+        type='RefCOCOPlusUNC',
+        which_set='val',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOPlusUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testA=dict(
+        type='RefCOCOPlusUNC',
+        which_set='testA',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOPlusUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testB=dict(
+        type='RefCOCOPlusUNC',
+        which_set='testB',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOPlusUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/refcoco+/PropVG-refcoco+.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 20
+max_token = 20
+img_size = 384
+patch_size = 16
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=384,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='REFHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=100)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=True,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[21, 27],
+    decay_ratio=0.1,
+    max_epoch=30)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 4
+2025-07-07 11:09:07,978 - PropVG - INFO - RefCOCOPlusUNC-val size: 10758
+2025-07-07 11:09:13,867 - PropVG - INFO - RefCOCOPlusUNC-testA size: 5726
+2025-07-07 11:09:19,990 - PropVG - INFO - RefCOCOPlusUNC-testB size: 4889
+2025-07-07 11:09:24,879 - PropVG - INFO - loaded checkpoint from work_dir/refcoco+/PropVG-refcoco+.pth
+2025-07-07 11:09:24,886 - PropVG - INFO - PropVG - evaluating set val
+2025-07-07 11:11:17,140 - PropVG - INFO - ------------ validate ------------  time: 112.25, DetACC: 83.73, mIoU: 72.94, oIoU: 70.24, MaskACC@0.5-0.9: [83.12, 80.60, 76.04,  65.37,  33.26]DetACC@0.5-0.9: [83.73, 81.30, 77.10,  68.58,  42.65]
+2025-07-07 11:11:18,910 - PropVG - INFO - PropVG - evaluating set testA
+2025-07-07 11:12:32,835 - PropVG - INFO - ------------ validate ------------  time: 73.92, DetACC: 88.01, mIoU: 76.49, oIoU: 74.32, MaskACC@0.5-0.9: [88.04, 86.00, 81.37,  70.53,  33.52]DetACC@0.5-0.9: [88.01, 85.91, 82.12,  73.80,  47.14]
+2025-07-07 11:12:34,541 - PropVG - INFO - PropVG - evaluating set testB
+2025-07-07 11:13:39,576 - PropVG - INFO - ------------ validate ------------  time: 65.03, DetACC: 76.59, mIoU: 67.21, oIoU: 63.41, MaskACC@0.5-0.9: [75.57, 71.83, 66.95,  57.38,  33.87]DetACC@0.5-0.9: [76.59, 73.26, 68.11,  59.24,  36.12]
+2025-07-07 11:13:41,507 - PropVG - INFO - successfully save the results to work_dir/refcoco+/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!

refcoco-mix/PropVG-refcoco-mix.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78ae3b32e6ea3c4bbfc84faaa46e50f80c3076175d2b8c346497e19bcd2fffd9
+size 987636053

refcoco-mix/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED Viewed

Binary file (5.55 kB). View file

refcoco-mix/test_log.txt ADDED Viewed

	@@ -0,0 +1,540 @@

+2025-07-07 11:27:50,676 - PropVG - INFO - dataset = 'MixedSeg'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='MixedSeg',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='MixedSeg',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='MixedSeg',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='MixedSeg',
+        which_set='train',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_refcoco_unc=dict(
+        type='MixedSeg',
+        which_set='val_refcoco_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testA_refcoco_unc=dict(
+        type='MixedSeg',
+        which_set='testA_refcoco_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testB_refcoco_unc=dict(
+        type='MixedSeg',
+        which_set='testB_refcoco_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_refcocoplus_unc=dict(
+        type='MixedSeg',
+        which_set='val_refcocoplus_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testA_refcocoplus_unc=dict(
+        type='MixedSeg',
+        which_set='testA_refcocoplus_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testB_refcocoplus_unc=dict(
+        type='MixedSeg',
+        which_set='testB_refcocoplus_unc',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_refcocog_umd=dict(
+        type='MixedSeg',
+        which_set='val_refcocog_umd',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    test_refcocog_umd=dict(
+        type='MixedSeg',
+        which_set='test_refcocog_umd',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='MixedSeg',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/refcoco-mix/PropVG-refcoco-mix.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 20
+max_token = 20
+img_size = 384
+patch_size = 16
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=384,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='REFHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=100)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=False,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[21, 27],
+    decay_ratio=0.1,
+    max_epoch=30)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 1
+2025-07-07 11:27:58,403 - PropVG - INFO - Mixed-val_refcoco_unc size: 10834
+2025-07-07 11:28:06,594 - PropVG - INFO - Mixed-testA_refcoco_unc size: 5657
+2025-07-07 11:28:15,164 - PropVG - INFO - Mixed-testB_refcoco_unc size: 5095
+2025-07-07 11:28:23,677 - PropVG - INFO - Mixed-val_refcocoplus_unc size: 10758
+2025-07-07 11:28:30,907 - PropVG - INFO - Mixed-testA_refcocoplus_unc size: 5726
+2025-07-07 11:28:38,494 - PropVG - INFO - Mixed-testB_refcocoplus_unc size: 4889
+2025-07-07 11:28:49,090 - PropVG - INFO - Mixed-val_refcocog_umd size: 4896
+2025-07-07 11:28:54,576 - PropVG - INFO - Mixed-test_refcocog_umd size: 9602
+2025-07-07 11:29:02,664 - PropVG - INFO - loaded checkpoint from work_dir/refcoco-mix/PropVG-refcoco-mix.pth
+2025-07-07 11:29:02,665 - PropVG - INFO - PropVG - evaluating set val_refcoco_unc
+2025-07-07 11:32:39,213 - PropVG - INFO - ------------ validate ------------  time: 216.54, DetACC: 92.70, mIoU: 81.96, oIoU: 81.80, MaskACC@0.5-0.9: [92.24, 90.71, 87.59,  79.79,  46.59]DetACC@0.5-0.9: [92.70, 91.43, 88.90,  83.85,  66.30]
+2025-07-07 11:32:43,474 - PropVG - INFO - PropVG - evaluating set testA_refcoco_unc
+2025-07-07 11:34:47,838 - PropVG - INFO - ------------ validate ------------  time: 124.36, DetACC: 95.07, mIoU: 83.58, oIoU: 83.74, MaskACC@0.5-0.9: [94.56, 93.48, 90.93,  82.91,  46.61]DetACC@0.5-0.9: [95.07, 93.99, 92.17,  88.17,  69.29]
+2025-07-07 11:34:53,297 - PropVG - INFO - PropVG - evaluating set testB_refcoco_unc
+2025-07-07 11:36:51,290 - PropVG - INFO - ------------ validate ------------  time: 117.99, DetACC: 89.58, mIoU: 80.02, oIoU: 79.33, MaskACC@0.5-0.9: [89.19, 86.99, 83.45,  76.76,  51.07]DetACC@0.5-0.9: [89.58, 87.56, 84.61,  79.14,  61.83]
+2025-07-07 11:36:56,652 - PropVG - INFO - PropVG - evaluating set val_refcocoplus_unc
+2025-07-07 11:40:28,540 - PropVG - INFO - ------------ validate ------------  time: 211.88, DetACC: 87.27, mIoU: 77.14, oIoU: 74.81, MaskACC@0.5-0.9: [86.67, 85.36, 82.52,  75.28,  44.34]DetACC@0.5-0.9: [87.27, 86.30, 84.09,  79.64,  63.62]
+2025-07-07 11:40:36,392 - PropVG - INFO - PropVG - evaluating set testA_refcocoplus_unc
+2025-07-07 11:42:43,800 - PropVG - INFO - ------------ validate ------------  time: 127.40, DetACC: 90.87, mIoU: 79.83, oIoU: 78.72, MaskACC@0.5-0.9: [90.13, 88.79, 86.57,  79.46,  45.04]DetACC@0.5-0.9: [90.87, 89.82, 87.81,  83.92,  66.33]
+2025-07-07 11:42:48,169 - PropVG - INFO - PropVG - evaluating set testB_refcocoplus_unc
+2025-07-07 11:44:41,261 - PropVG - INFO - ------------ validate ------------  time: 113.09, DetACC: 81.26, mIoU: 72.18, oIoU: 69.15, MaskACC@0.5-0.9: [80.18, 78.20, 74.78,  68.68,  45.88]DetACC@0.5-0.9: [81.26, 79.40, 76.95,  72.20,  56.78]
+2025-07-07 11:44:45,751 - PropVG - INFO - PropVG - evaluating set val_refcocog_umd
+2025-07-07 11:46:42,173 - PropVG - INFO - ------------ validate ------------  time: 116.42, DetACC: 88.15, mIoU: 76.97, oIoU: 75.54, MaskACC@0.5-0.9: [86.17, 83.58, 79.43,  72.16,  44.87]DetACC@0.5-0.9: [88.15, 85.97, 82.90,  78.00,  63.09]
+2025-07-07 11:46:46,257 - PropVG - INFO - PropVG - evaluating set test_refcocog_umd
+2025-07-07 11:50:06,821 - PropVG - INFO - ------------ validate ------------  time: 200.56, DetACC: 88.30, mIoU: 77.72, oIoU: 77.40, MaskACC@0.5-0.9: [87.14, 85.01, 80.84,  72.78,  45.79]DetACC@0.5-0.9: [88.30, 86.71, 83.98,  79.07,  65.00]
+2025-07-07 11:50:11,168 - PropVG - INFO - sucessfully save the results to work_dir/refcoco-mix/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!

refcoco/PropVG-refcoco.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd95a1952b4ac17c234432463e39e3eca42802ffdbbffcc787ea2034c5f1ac5b
+size 987632917

refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED Viewed

Binary file (5.19 kB). View file

refcoco/test_log.txt ADDED Viewed

	@@ -0,0 +1,335 @@

+2025-07-07 11:04:05,796 - PropVG - INFO - dataset = 'RefCOCOUNC'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOUNC',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='RefCOCOUNC',
+        which_set='train',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val=dict(
+        type='RefCOCOUNC',
+        which_set='val',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testA=dict(
+        type='RefCOCOUNC',
+        which_set='testA',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    testB=dict(
+        type='RefCOCOUNC',
+        which_set='testB',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOUNC',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/refcoco/PropVG-refcoco.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 20
+max_token = 20
+img_size = 384
+patch_size = 16
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=384,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='REFHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=100)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=True,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[21, 27],
+    decay_ratio=0.1,
+    max_epoch=30)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 4
+2025-07-07 11:04:11,542 - PropVG - INFO - RefCOCOUNC-val size: 10834
+2025-07-07 11:04:17,084 - PropVG - INFO - RefCOCOUNC-testA size: 5657
+2025-07-07 11:04:22,843 - PropVG - INFO - RefCOCOUNC-testB size: 5095
+2025-07-07 11:04:28,381 - PropVG - INFO - loaded checkpoint from work_dir/refcoco/PropVG-refcoco.pth
+2025-07-07 11:04:28,382 - PropVG - INFO - PropVG - evaluating set val
+2025-07-07 11:06:19,535 - PropVG - INFO - ------------ validate ------------  time: 111.15, DetACC: 88.95, mIoU: 77.98, oIoU: 76.79, MaskACC@0.5-0.9: [89.14, 86.33, 81.66,  70.75,  36.42]DetACC@0.5-0.9: [88.95, 86.66, 82.62,  73.65,  47.82]
+2025-07-07 11:06:21,284 - PropVG - INFO - PropVG - evaluating set testA
+2025-07-07 11:07:29,418 - PropVG - INFO - ------------ validate ------------  time: 68.13, DetACC: 91.55, mIoU: 79.81, oIoU: 79.57, MaskACC@0.5-0.9: [91.66, 89.84, 85.42,  73.96,  36.22]DetACC@0.5-0.9: [91.55, 89.95, 85.94,  77.69,  51.57]
+2025-07-07 11:07:30,844 - PropVG - INFO - PropVG - evaluating set testB
+2025-07-07 11:08:36,434 - PropVG - INFO - ------------ validate ------------  time: 65.59, DetACC: 85.73, mIoU: 75.28, oIoU: 73.68, MaskACC@0.5-0.9: [84.95, 81.26, 76.06,  65.64,  38.81]DetACC@0.5-0.9: [85.73, 82.06, 76.20,  66.88,  41.97]
+2025-07-07 11:08:37,918 - PropVG - INFO - sucessfully save the results to work_dir/refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!

refcocog/PropVG-refcocog.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:004695f51a341fea17b4e7a7ff1186ada40745ca5d794b92f1adb4f6f55e9b76
+size 987633701

refcocog/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED Viewed

Binary file (5.12 kB). View file

refcocog/test_log.txt ADDED Viewed

	@@ -0,0 +1,294 @@

+2025-07-07 11:14:04,236 - PropVG - INFO - dataset = 'RefCOCOgUMD'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOgUMD',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOgUMD',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile_TO',
+        max_token=20,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefCOCOgUMD',
+        use_token_type='beit3',
+        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='RefCOCOgUMD',
+        which_set='train',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOgUMD',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val=dict(
+        type='RefCOCOgUMD',
+        which_set='val',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOgUMD',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    test=dict(
+        type='RefCOCOgUMD',
+        which_set='test',
+        img_source=['coco'],
+        annsfile=
+        './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFile_TO',
+                max_token=20,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefCOCOgUMD',
+                use_token_type='beit3',
+                refer_file=
+                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/refcocog/PropVG-refcocog.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 20
+max_token = 20
+img_size = 384
+patch_size = 16
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=384,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='REFHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=100)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=True,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[21, 27],
+    decay_ratio=0.1,
+    max_epoch=30)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 1
+2025-07-07 11:14:09,303 - PropVG - INFO - RefCOCOg-val size: 4896
+2025-07-07 11:14:14,811 - PropVG - INFO - RefCOCOg-test size: 9602
+2025-07-07 11:14:19,468 - PropVG - INFO - loaded checkpoint from work_dir/refcocog/PropVG-refcocog.pth
+2025-07-07 11:14:19,479 - PropVG - INFO - PropVG - evaluating set val
+2025-07-07 11:16:13,025 - PropVG - INFO - ------------ validate ------------  time: 113.54, DetACC: 83.50, mIoU: 71.34, oIoU: 69.30, MaskACC@0.5-0.9: [81.19, 77.33, 71.51,  60.15,  30.78]DetACC@0.5-0.9: [83.50, 80.09, 75.41,  66.07,  40.54]
+2025-07-07 11:16:15,090 - PropVG - INFO - PropVG - evaluating set test
+2025-07-07 11:19:29,251 - PropVG - INFO - ------------ validate ------------  time: 194.16, DetACC: 84.44, mIoU: 72.10, oIoU: 70.53, MaskACC@0.5-0.9: [82.53, 78.47, 72.66,  61.23,  30.31]DetACC@0.5-0.9: [84.44, 81.32, 76.33,  67.14,  42.69]
+2025-07-07 11:19:31,176 - PropVG - INFO - sucessfully save the results to work_dir/refcocog/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!

refzom/PropVG-refzom.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4afe5112bf3f560532da5783f483bc286bfe5cf757035945b2928cebd696231e
+size 987091461

refzom/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED Viewed

Binary file (5.01 kB). View file

refzom/test_log.txt ADDED Viewed

	@@ -0,0 +1,240 @@

+2025-07-07 11:19:47,247 - PropVG - INFO - dataset = 'RefZOM'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefZOM',
+        use_token_type='beit3',
+        refer_file=
+        '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RefZOM',
+        use_token_type='beit3',
+        refer_file=
+        '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile',
+        max_token=20,
+        with_bbox=True,
+        dataset='RefZOM'),
+    dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='CollectData', keys=['img', 'ref_expr_inds', 'gt_bbox'])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        type='RefZOM',
+        which_set='train',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/ref-zom/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/trainval2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefZOM',
+                use_token_type='beit3',
+                refer_file=
+                '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val=dict(
+        type='RefZOM',
+        which_set='test',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/ref-zom/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/trainval2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RefZOM',
+                use_token_type='beit3',
+                refer_file=
+                '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/refzom/PropVG-refzom.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 9
+max_token = 50
+img_size = 320
+patch_size = 16
+num_queries = 20
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=320,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='GTMHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=100)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=True,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[7, 11],
+    decay_ratio=0.1,
+    max_epoch=12)
+launcher = 'pytorch'
+distributed = True
+rank = 0
+world_size = 4
+2025-07-07 11:19:56,830 - PropVG - INFO - RefZOM-test size: 21770
+2025-07-07 11:20:02,074 - PropVG - INFO - loaded checkpoint from work_dir/refzom/PropVG-refzom.pth
+2025-07-07 11:20:02,098 - PropVG - INFO - PropVG - evaluating set val
+2025-07-07 11:22:14,202 - PropVG - INFO - ------------ validate ------------time: 132.10, mIoU: 71.15, oIoU: 71.95, macc: 98.11, MaskACC@0.5-0.9: [81.03, 76.58, 70.25, 57.06, 25.40
+2025-07-07 11:22:15,468 - PropVG - INFO - sucessfully save the results to work_dir/refzom/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!

rrefcoco/PropVG-rrefcoco.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c03212f30621e0c364a677ad942c3498ffbab00c7851ed4faee0efb3b858371
+size 987093029

rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx ADDED Viewed

Binary file (5.08 kB). View file

rrefcoco/test_log.txt ADDED Viewed

	@@ -0,0 +1,314 @@

+2025-07-07 11:46:17,817 - PropVG - INFO - dataset = 'RRefCOCO'
+data_root = './data/seqtr_type/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+train_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RRefCOCO',
+        use_token_type='beit3',
+        refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox', 'gt_mask_parts_rle'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index', 'tokenized_words'
+        ])
+]
+val_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFileGRES_TO',
+        max_token=50,
+        with_mask=True,
+        with_bbox=True,
+        dataset='RRefCOCO',
+        use_token_type='beit3',
+        refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+        object_area_filter=100,
+        object_area_rate_filter=[0.05, 0.8]),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='CollectData',
+        keys=[
+            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
+            'gt_bbox', 'gt_mask_parts_rle'
+        ],
+        meta_keys=[
+            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
+            'scale_factor', 'gt_ori_mask', 'target', 'empty',
+            'refer_target_index', 'tokenized_words'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadImageAnnotationsFromFile',
+        max_token=20,
+        with_bbox=True,
+        dataset='RRefCOCO'),
+    dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
+    dict(
+        type='Normalize',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='CollectData', keys=['img', 'ref_expr_inds', 'gt_bbox'])
+]
+word_emb_cfg = dict(type='GloVe')
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        type='RRefCOCO',
+        which_set='train',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RRefCOCO',
+                use_token_type='beit3',
+                refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_rrefcoco=dict(
+        type='RRefCOCO',
+        which_set='val_rrefcoco',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RRefCOCO',
+                use_token_type='beit3',
+                refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_rrefcocoplus=dict(
+        type='RRefCOCO',
+        which_set='val_rrefcoco+',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RRefCOCO',
+                use_token_type='beit3',
+                refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')),
+    val_rrefcocog=dict(
+        type='RRefCOCO',
+        which_set='val_rrefcocog',
+        img_source=['coco'],
+        annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
+        imgsfile='./data/seqtr_type/images/mscoco/train2014',
+        pipeline=[
+            dict(
+                type='LoadImageAnnotationsFromFileGRES_TO',
+                max_token=50,
+                with_mask=True,
+                with_bbox=True,
+                dataset='RRefCOCO',
+                use_token_type='beit3',
+                refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
+                object_area_filter=100,
+                object_area_rate_filter=[0.05, 0.8]),
+            dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375]),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='CollectData',
+                keys=[
+                    'img', 'ref_expr_inds', 'text_attention_mask',
+                    'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
+                ],
+                meta_keys=[
+                    'filename', 'expression', 'ori_shape', 'img_shape',
+                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
+                    'empty', 'refer_target_index', 'tokenized_words'
+                ])
+        ],
+        word_emb_cfg=dict(type='GloVe')))
+ema = False
+ema_factor = 0.999
+use_fp16 = False
+seed = 6666
+deterministic = True
+log_level = 'INFO'
+log_interval = 50
+save_interval = -1
+resume_from = None
+load_from = 'work_dir/rrefcoco/PropVG-rrefcoco.pth'
+finetune_from = None
+evaluate_interval = 1
+start_evaluate_epoch = 0
+start_save_checkpoint = 9
+max_token = 50
+img_size = 320
+patch_size = 16
+num_queries = 20
+model = dict(
+    type='MIXRefUniModel_OMG',
+    vis_enc=dict(
+        type='BEIT3',
+        img_size=320,
+        patch_size=16,
+        vit_type='base',
+        drop_path_rate=0.1,
+        vocab_size=64010,
+        freeze_layer=-1,
+        vision_embed_proj_interpolate=False,
+        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
+    lan_enc=None,
+    fusion=None,
+    head=dict(
+        type='GTMHead',
+        input_channels=768,
+        hidden_channels=256,
+        num_queries=20,
+        detr_loss=dict(
+            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
+            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
+        loss_weight=dict(
+            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
+            bbox=0.1,
+            allbbox=0.1,
+            refer=1.0),
+        MTD=dict(K=250)),
+    post_params=dict(
+        score_weighted=False,
+        mask_threshold=0.5,
+        score_threshold=0.7,
+        with_nms=False,
+        with_mask=True),
+    process_visual=True,
+    visualize_params=dict(row_columns=(4, 5)),
+    visual_mode='test')
+grad_norm_clip = 0.15
+lr = 0.0005
+optimizer_config = dict(
+    type='Adam',
+    lr=0.0005,
+    lr_vis_enc=5e-05,
+    lr_lan_enc=0.0005,
+    betas=(0.9, 0.98),
+    eps=1e-09,
+    weight_decay=0,
+    amsgrad=True)
+scheduler_config = dict(
+    type='MultiStepLRWarmUp',
+    warmup_epochs=1,
+    decay_steps=[7, 11],
+    decay_ratio=0.1,
+    max_epoch=12)
+launcher = 'none'
+distributed = False
+rank = 0
+world_size = 1
+2025-07-07 11:46:34,374 - PropVG - INFO - RRefCOCO-val_rrefcoco size: 52229
+2025-07-07 11:46:53,442 - PropVG - INFO - RRefCOCO-val_rrefcoco+ size: 49620
+2025-07-07 11:47:11,525 - PropVG - INFO - RRefCOCO-val_rrefcocog size: 33960
+2025-07-07 11:47:16,069 - PropVG - INFO - loaded checkpoint from work_dir/rrefcoco/PropVG-rrefcoco.pth
+2025-07-07 11:47:16,070 - PropVG - INFO - PropVG - evaluating set val_rrefcoco
+2025-07-07 11:58:15,741 - PropVG - INFO - ------------ validate ------------time: 659.65, mIoU: 75.86, oIoU: 76.87, mRR: 93.03, rIoU: 62.91
+2025-07-07 11:58:18,322 - PropVG - INFO - PropVG - evaluating set val_rrefcoco+
+2025-07-07 12:07:56,811 - PropVG - INFO - ------------ validate ------------time: 578.47, mIoU: 69.39, oIoU: 69.17, mRR: 94.96, rIoU: 59.44
+2025-07-07 12:07:58,975 - PropVG - INFO - PropVG - evaluating set val_rrefcocog
+2025-07-07 12:14:35,849 - PropVG - INFO - ------------ validate ------------time: 396.86, mIoU: 69.20, oIoU: 70.13, mRR: 93.85, rIoU: 56.17
+2025-07-07 12:14:37,866 - PropVG - INFO - sucessfully save the results to work_dir/rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx !!!