2025-07-07 11:46:17,817 - PropVG - INFO - dataset = 'RRefCOCO' data_root = './data/seqtr_type/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) train_pipeline = [ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ] val_pipeline = [ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ] test_pipeline = [ dict( type='LoadImageAnnotationsFromFile', max_token=20, with_bbox=True, dataset='RRefCOCO'), dict(type='Resize', img_scale=(512, 512), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='CollectData', keys=['img', 'ref_expr_inds', 'gt_bbox']) ] word_emb_cfg = dict(type='GloVe') data = dict( samples_per_gpu=16, workers_per_gpu=4, train=dict( type='RRefCOCO', which_set='train', img_source=['coco'], annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json', imgsfile='./data/seqtr_type/images/mscoco/train2014', pipeline=[ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ], word_emb_cfg=dict(type='GloVe')), val_rrefcoco=dict( type='RRefCOCO', which_set='val_rrefcoco', img_source=['coco'], annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json', imgsfile='./data/seqtr_type/images/mscoco/train2014', pipeline=[ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ], word_emb_cfg=dict(type='GloVe')), val_rrefcocoplus=dict( type='RRefCOCO', which_set='val_rrefcoco+', img_source=['coco'], annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json', imgsfile='./data/seqtr_type/images/mscoco/train2014', pipeline=[ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ], word_emb_cfg=dict(type='GloVe')), val_rrefcocog=dict( type='RRefCOCO', which_set='val_rrefcocog', img_source=['coco'], annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json', imgsfile='./data/seqtr_type/images/mscoco/train2014', pipeline=[ dict( type='LoadImageAnnotationsFromFileGRES_TO', max_token=50, with_mask=True, with_bbox=True, dataset='RRefCOCO', use_token_type='beit3', refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json', object_area_filter=100, object_area_rate_filter=[0.05, 0.8]), dict(type='Resize', img_scale=(320, 320), keep_ratio=False), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='DefaultFormatBundle'), dict( type='CollectData', keys=[ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle' ], meta_keys=[ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target', 'empty', 'refer_target_index', 'tokenized_words' ]) ], word_emb_cfg=dict(type='GloVe'))) ema = False ema_factor = 0.999 use_fp16 = False seed = 6666 deterministic = True log_level = 'INFO' log_interval = 50 save_interval = -1 resume_from = None load_from = 'work_dir/rrefcoco/PropVG-rrefcoco.pth' finetune_from = None evaluate_interval = 1 start_evaluate_epoch = 0 start_save_checkpoint = 9 max_token = 50 img_size = 320 patch_size = 16 num_queries = 20 model = dict( type='MIXRefUniModel_OMG', vis_enc=dict( type='BEIT3', img_size=320, patch_size=16, vit_type='base', drop_path_rate=0.1, vocab_size=64010, freeze_layer=-1, vision_embed_proj_interpolate=False, pretrain='pretrain_weights/beit3_base_patch16_224.zip'), lan_enc=None, fusion=None, head=dict( type='GTMHead', input_channels=768, hidden_channels=256, num_queries=20, detr_loss=dict( criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0), matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)), loss_weight=dict( mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0), bbox=0.1, allbbox=0.1, refer=1.0), MTD=dict(K=250)), post_params=dict( score_weighted=False, mask_threshold=0.5, score_threshold=0.7, with_nms=False, with_mask=True), process_visual=True, visualize_params=dict(row_columns=(4, 5)), visual_mode='test') grad_norm_clip = 0.15 lr = 0.0005 optimizer_config = dict( type='Adam', lr=0.0005, lr_vis_enc=5e-05, lr_lan_enc=0.0005, betas=(0.9, 0.98), eps=1e-09, weight_decay=0, amsgrad=True) scheduler_config = dict( type='MultiStepLRWarmUp', warmup_epochs=1, decay_steps=[7, 11], decay_ratio=0.1, max_epoch=12) launcher = 'none' distributed = False rank = 0 world_size = 1 2025-07-07 11:46:34,374 - PropVG - INFO - RRefCOCO-val_rrefcoco size: 52229 2025-07-07 11:46:53,442 - PropVG - INFO - RRefCOCO-val_rrefcoco+ size: 49620 2025-07-07 11:47:11,525 - PropVG - INFO - RRefCOCO-val_rrefcocog size: 33960 2025-07-07 11:47:16,069 - PropVG - INFO - loaded checkpoint from work_dir/rrefcoco/PropVG-rrefcoco.pth 2025-07-07 11:47:16,070 - PropVG - INFO - PropVG - evaluating set val_rrefcoco 2025-07-07 11:58:15,741 - PropVG - INFO - ------------ validate ------------time: 659.65, mIoU: 75.86, oIoU: 76.87, mRR: 93.03, rIoU: 62.91 2025-07-07 11:58:18,322 - PropVG - INFO - PropVG - evaluating set val_rrefcoco+ 2025-07-07 12:07:56,811 - PropVG - INFO - ------------ validate ------------time: 578.47, mIoU: 69.39, oIoU: 69.17, mRR: 94.96, rIoU: 59.44 2025-07-07 12:07:58,975 - PropVG - INFO - PropVG - evaluating set val_rrefcocog 2025-07-07 12:14:35,849 - PropVG - INFO - ------------ validate ------------time: 396.86, mIoU: 69.20, oIoU: 70.13, mRR: 93.85, rIoU: 56.17 2025-07-07 12:14:37,866 - PropVG - INFO - sucessfully save the results to work_dir/rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx !!!