File size: 12,279 Bytes
a482a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
2025-07-07 11:04:05,796 - PropVG - INFO - dataset = 'RefCOCOUNC'
data_root = './data/seqtr_type/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
train_pipeline = [
    dict(
        type='LoadImageAnnotationsFromFile_TO',
        max_token=20,
        with_mask=True,
        with_bbox=True,
        dataset='RefCOCOUNC',
        use_token_type='beit3',
        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
        object_area_filter=100,
        object_area_rate_filter=[0.05, 0.8]),
    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375]),
    dict(type='DefaultFormatBundle'),
    dict(
        type='CollectData',
        keys=[
            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
            'gt_bbox'
        ],
        meta_keys=[
            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
            'scale_factor', 'gt_ori_mask', 'target', 'empty',
            'refer_target_index'
        ])
]
val_pipeline = [
    dict(
        type='LoadImageAnnotationsFromFile_TO',
        max_token=20,
        with_mask=True,
        with_bbox=True,
        dataset='RefCOCOUNC',
        use_token_type='beit3',
        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
        object_area_filter=100,
        object_area_rate_filter=[0.05, 0.8]),
    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375]),
    dict(type='DefaultFormatBundle'),
    dict(
        type='CollectData',
        keys=[
            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
            'gt_bbox'
        ],
        meta_keys=[
            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
            'scale_factor', 'gt_ori_mask', 'target', 'empty',
            'refer_target_index'
        ])
]
test_pipeline = [
    dict(
        type='LoadImageAnnotationsFromFile_TO',
        max_token=20,
        with_mask=True,
        with_bbox=True,
        dataset='RefCOCOUNC',
        use_token_type='beit3',
        refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
        object_area_filter=100,
        object_area_rate_filter=[0.05, 0.8]),
    dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375]),
    dict(type='DefaultFormatBundle'),
    dict(
        type='CollectData',
        keys=[
            'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
            'gt_bbox'
        ],
        meta_keys=[
            'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
            'scale_factor', 'gt_ori_mask', 'target', 'empty',
            'refer_target_index'
        ])
]
word_emb_cfg = dict(type='GloVe')
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type='RefCOCOUNC',
        which_set='train',
        img_source=['coco'],
        annsfile=
        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
        imgsfile='./data/seqtr_type/images/mscoco/train2014',
        pipeline=[
            dict(
                type='LoadImageAnnotationsFromFile_TO',
                max_token=20,
                with_mask=True,
                with_bbox=True,
                dataset='RefCOCOUNC',
                use_token_type='beit3',
                refer_file=
                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
                object_area_filter=100,
                object_area_rate_filter=[0.05, 0.8]),
            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375]),
            dict(type='DefaultFormatBundle'),
            dict(
                type='CollectData',
                keys=[
                    'img', 'ref_expr_inds', 'text_attention_mask',
                    'gt_mask_rle', 'gt_bbox'
                ],
                meta_keys=[
                    'filename', 'expression', 'ori_shape', 'img_shape',
                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
                    'empty', 'refer_target_index'
                ])
        ],
        word_emb_cfg=dict(type='GloVe')),
    val=dict(
        type='RefCOCOUNC',
        which_set='val',
        img_source=['coco'],
        annsfile=
        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
        imgsfile='./data/seqtr_type/images/mscoco/train2014',
        pipeline=[
            dict(
                type='LoadImageAnnotationsFromFile_TO',
                max_token=20,
                with_mask=True,
                with_bbox=True,
                dataset='RefCOCOUNC',
                use_token_type='beit3',
                refer_file=
                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
                object_area_filter=100,
                object_area_rate_filter=[0.05, 0.8]),
            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375]),
            dict(type='DefaultFormatBundle'),
            dict(
                type='CollectData',
                keys=[
                    'img', 'ref_expr_inds', 'text_attention_mask',
                    'gt_mask_rle', 'gt_bbox'
                ],
                meta_keys=[
                    'filename', 'expression', 'ori_shape', 'img_shape',
                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
                    'empty', 'refer_target_index'
                ])
        ],
        word_emb_cfg=dict(type='GloVe')),
    testA=dict(
        type='RefCOCOUNC',
        which_set='testA',
        img_source=['coco'],
        annsfile=
        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
        imgsfile='./data/seqtr_type/images/mscoco/train2014',
        pipeline=[
            dict(
                type='LoadImageAnnotationsFromFile_TO',
                max_token=20,
                with_mask=True,
                with_bbox=True,
                dataset='RefCOCOUNC',
                use_token_type='beit3',
                refer_file=
                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
                object_area_filter=100,
                object_area_rate_filter=[0.05, 0.8]),
            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375]),
            dict(type='DefaultFormatBundle'),
            dict(
                type='CollectData',
                keys=[
                    'img', 'ref_expr_inds', 'text_attention_mask',
                    'gt_mask_rle', 'gt_bbox'
                ],
                meta_keys=[
                    'filename', 'expression', 'ori_shape', 'img_shape',
                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
                    'empty', 'refer_target_index'
                ])
        ],
        word_emb_cfg=dict(type='GloVe')),
    testB=dict(
        type='RefCOCOUNC',
        which_set='testB',
        img_source=['coco'],
        annsfile=
        './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
        imgsfile='./data/seqtr_type/images/mscoco/train2014',
        pipeline=[
            dict(
                type='LoadImageAnnotationsFromFile_TO',
                max_token=20,
                with_mask=True,
                with_bbox=True,
                dataset='RefCOCOUNC',
                use_token_type='beit3',
                refer_file=
                'data/seqtr_type/annotations/mixed-seg/coco_all.json',
                object_area_filter=100,
                object_area_rate_filter=[0.05, 0.8]),
            dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375]),
            dict(type='DefaultFormatBundle'),
            dict(
                type='CollectData',
                keys=[
                    'img', 'ref_expr_inds', 'text_attention_mask',
                    'gt_mask_rle', 'gt_bbox'
                ],
                meta_keys=[
                    'filename', 'expression', 'ori_shape', 'img_shape',
                    'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
                    'empty', 'refer_target_index'
                ])
        ],
        word_emb_cfg=dict(type='GloVe')))
ema = False
ema_factor = 0.999
use_fp16 = False
seed = 6666
deterministic = True
log_level = 'INFO'
log_interval = 50
save_interval = -1
resume_from = None
load_from = 'work_dir/refcoco/PropVG-refcoco.pth'
finetune_from = None
evaluate_interval = 1
start_evaluate_epoch = 0
start_save_checkpoint = 20
max_token = 20
img_size = 384
patch_size = 16
model = dict(
    type='MIXRefUniModel_OMG',
    vis_enc=dict(
        type='BEIT3',
        img_size=384,
        patch_size=16,
        vit_type='base',
        drop_path_rate=0.1,
        vocab_size=64010,
        freeze_layer=-1,
        vision_embed_proj_interpolate=False,
        pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
    lan_enc=None,
    fusion=None,
    head=dict(
        type='REFHead',
        input_channels=768,
        hidden_channels=256,
        num_queries=20,
        detr_loss=dict(
            criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
            matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
        loss_weight=dict(
            mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
            bbox=0.1,
            allbbox=0.1,
            refer=1.0),
        MTD=dict(K=100)),
    post_params=dict(
        score_weighted=False,
        mask_threshold=0.5,
        score_threshold=0.7,
        with_nms=False,
        with_mask=True),
    process_visual=True,
    visualize_params=dict(row_columns=(4, 5)),
    visual_mode='test')
grad_norm_clip = 0.15
lr = 0.0005
optimizer_config = dict(
    type='Adam',
    lr=0.0005,
    lr_vis_enc=5e-05,
    lr_lan_enc=0.0005,
    betas=(0.9, 0.98),
    eps=1e-09,
    weight_decay=0,
    amsgrad=True)
scheduler_config = dict(
    type='MultiStepLRWarmUp',
    warmup_epochs=1,
    decay_steps=[21, 27],
    decay_ratio=0.1,
    max_epoch=30)
launcher = 'pytorch'
distributed = True
rank = 0
world_size = 4

2025-07-07 11:04:11,542 - PropVG - INFO - RefCOCOUNC-val size: 10834
2025-07-07 11:04:17,084 - PropVG - INFO - RefCOCOUNC-testA size: 5657
2025-07-07 11:04:22,843 - PropVG - INFO - RefCOCOUNC-testB size: 5095
2025-07-07 11:04:28,381 - PropVG - INFO - loaded checkpoint from work_dir/refcoco/PropVG-refcoco.pth

2025-07-07 11:04:28,382 - PropVG - INFO - PropVG - evaluating set val
2025-07-07 11:06:19,535 - PropVG - INFO - ------------ validate ------------  time: 111.15, DetACC: 88.95, mIoU: 77.98, oIoU: 76.79, MaskACC@0.5-0.9: [89.14, 86.33, 81.66,  70.75,  36.42]DetACC@0.5-0.9: [88.95, 86.66, 82.62,  73.65,  47.82]
2025-07-07 11:06:21,284 - PropVG - INFO - PropVG - evaluating set testA
2025-07-07 11:07:29,418 - PropVG - INFO - ------------ validate ------------  time: 68.13, DetACC: 91.55, mIoU: 79.81, oIoU: 79.57, MaskACC@0.5-0.9: [91.66, 89.84, 85.42,  73.96,  36.22]DetACC@0.5-0.9: [91.55, 89.95, 85.94,  77.69,  51.57]
2025-07-07 11:07:30,844 - PropVG - INFO - PropVG - evaluating set testB
2025-07-07 11:08:36,434 - PropVG - INFO - ------------ validate ------------  time: 65.59, DetACC: 85.73, mIoU: 75.28, oIoU: 73.68, MaskACC@0.5-0.9: [84.95, 81.26, 76.06,  65.64,  38.81]DetACC@0.5-0.9: [85.73, 82.06, 76.20,  66.88,  41.97]
2025-07-07 11:08:37,918 - PropVG - INFO - sucessfully save the results to work_dir/refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!