Dmmm997 commited on
Commit
a482a69
·
verified ·
1 Parent(s): 8c86618

Upload 21 files

Browse files
grefcoco/PropVG-grefcoco.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:410cf0998478247598391597dc0da8f287079ade292257380e352d2dc4b64084
3
+ size 987093029
grefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx ADDED
Binary file (5.16 kB). View file
 
grefcoco/test_log.txt ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 10:57:14,028 - PropVG - INFO - dataset = 'GRefCOCO'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFileGRES_TO',
8
+ max_token=50,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='GRefCOCO',
12
+ use_token_type='beit3',
13
+ refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox', 'gt_mask_parts_rle'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index', 'tokenized_words'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFileGRES_TO',
37
+ max_token=50,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='GRefCOCO',
41
+ use_token_type='beit3',
42
+ refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox', 'gt_mask_parts_rle'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index', 'tokenized_words'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFileGRES_TO',
66
+ max_token=50,
67
+ with_mask=True,
68
+ with_bbox=True,
69
+ dataset='GRefCOCO',
70
+ use_token_type='beit3',
71
+ refer_file='data/seqtr_type/annotations/grefs/coco_annotations.json',
72
+ object_area_filter=100,
73
+ object_area_rate_filter=[0.05, 0.8]),
74
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
75
+ dict(
76
+ type='Normalize',
77
+ mean=[123.675, 116.28, 103.53],
78
+ std=[58.395, 57.12, 57.375]),
79
+ dict(type='DefaultFormatBundle'),
80
+ dict(
81
+ type='CollectData',
82
+ keys=[
83
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
84
+ 'gt_bbox', 'gt_mask_parts_rle'
85
+ ],
86
+ meta_keys=[
87
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
88
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
89
+ 'refer_target_index', 'tokenized_words'
90
+ ])
91
+ ]
92
+ word_emb_cfg = dict(type='GloVe')
93
+ data = dict(
94
+ samples_per_gpu=16,
95
+ workers_per_gpu=4,
96
+ train=dict(
97
+ type='GRefCOCO',
98
+ which_set='train',
99
+ img_source=['coco'],
100
+ annsfile='./data/seqtr_type/annotations/grefs/instances.json',
101
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
102
+ pipeline=[
103
+ dict(
104
+ type='LoadImageAnnotationsFromFileGRES_TO',
105
+ max_token=50,
106
+ with_mask=True,
107
+ with_bbox=True,
108
+ dataset='GRefCOCO',
109
+ use_token_type='beit3',
110
+ refer_file=
111
+ 'data/seqtr_type/annotations/grefs/coco_annotations.json',
112
+ object_area_filter=100,
113
+ object_area_rate_filter=[0.05, 0.8]),
114
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
115
+ dict(
116
+ type='Normalize',
117
+ mean=[123.675, 116.28, 103.53],
118
+ std=[58.395, 57.12, 57.375]),
119
+ dict(type='DefaultFormatBundle'),
120
+ dict(
121
+ type='CollectData',
122
+ keys=[
123
+ 'img', 'ref_expr_inds', 'text_attention_mask',
124
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
125
+ ],
126
+ meta_keys=[
127
+ 'filename', 'expression', 'ori_shape', 'img_shape',
128
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
129
+ 'empty', 'refer_target_index', 'tokenized_words'
130
+ ])
131
+ ],
132
+ word_emb_cfg=dict(type='GloVe')),
133
+ val=dict(
134
+ type='GRefCOCO',
135
+ which_set='val',
136
+ img_source=['coco'],
137
+ annsfile='./data/seqtr_type/annotations/grefs/instances.json',
138
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
139
+ pipeline=[
140
+ dict(
141
+ type='LoadImageAnnotationsFromFileGRES_TO',
142
+ max_token=50,
143
+ with_mask=True,
144
+ with_bbox=True,
145
+ dataset='GRefCOCO',
146
+ use_token_type='beit3',
147
+ refer_file=
148
+ 'data/seqtr_type/annotations/grefs/coco_annotations.json',
149
+ object_area_filter=100,
150
+ object_area_rate_filter=[0.05, 0.8]),
151
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
152
+ dict(
153
+ type='Normalize',
154
+ mean=[123.675, 116.28, 103.53],
155
+ std=[58.395, 57.12, 57.375]),
156
+ dict(type='DefaultFormatBundle'),
157
+ dict(
158
+ type='CollectData',
159
+ keys=[
160
+ 'img', 'ref_expr_inds', 'text_attention_mask',
161
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
162
+ ],
163
+ meta_keys=[
164
+ 'filename', 'expression', 'ori_shape', 'img_shape',
165
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
166
+ 'empty', 'refer_target_index', 'tokenized_words'
167
+ ])
168
+ ],
169
+ word_emb_cfg=dict(type='GloVe')),
170
+ testA=dict(
171
+ type='GRefCOCO',
172
+ which_set='testA',
173
+ img_source=['coco'],
174
+ annsfile='./data/seqtr_type/annotations/grefs/instances.json',
175
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
176
+ pipeline=[
177
+ dict(
178
+ type='LoadImageAnnotationsFromFileGRES_TO',
179
+ max_token=50,
180
+ with_mask=True,
181
+ with_bbox=True,
182
+ dataset='GRefCOCO',
183
+ use_token_type='beit3',
184
+ refer_file=
185
+ 'data/seqtr_type/annotations/grefs/coco_annotations.json',
186
+ object_area_filter=100,
187
+ object_area_rate_filter=[0.05, 0.8]),
188
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
189
+ dict(
190
+ type='Normalize',
191
+ mean=[123.675, 116.28, 103.53],
192
+ std=[58.395, 57.12, 57.375]),
193
+ dict(type='DefaultFormatBundle'),
194
+ dict(
195
+ type='CollectData',
196
+ keys=[
197
+ 'img', 'ref_expr_inds', 'text_attention_mask',
198
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
199
+ ],
200
+ meta_keys=[
201
+ 'filename', 'expression', 'ori_shape', 'img_shape',
202
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
203
+ 'empty', 'refer_target_index', 'tokenized_words'
204
+ ])
205
+ ],
206
+ word_emb_cfg=dict(type='GloVe')),
207
+ testB=dict(
208
+ type='GRefCOCO',
209
+ which_set='testB',
210
+ img_source=['coco'],
211
+ annsfile='./data/seqtr_type/annotations/grefs/instances.json',
212
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
213
+ pipeline=[
214
+ dict(
215
+ type='LoadImageAnnotationsFromFileGRES_TO',
216
+ max_token=50,
217
+ with_mask=True,
218
+ with_bbox=True,
219
+ dataset='GRefCOCO',
220
+ use_token_type='beit3',
221
+ refer_file=
222
+ 'data/seqtr_type/annotations/grefs/coco_annotations.json',
223
+ object_area_filter=100,
224
+ object_area_rate_filter=[0.05, 0.8]),
225
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
226
+ dict(
227
+ type='Normalize',
228
+ mean=[123.675, 116.28, 103.53],
229
+ std=[58.395, 57.12, 57.375]),
230
+ dict(type='DefaultFormatBundle'),
231
+ dict(
232
+ type='CollectData',
233
+ keys=[
234
+ 'img', 'ref_expr_inds', 'text_attention_mask',
235
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
236
+ ],
237
+ meta_keys=[
238
+ 'filename', 'expression', 'ori_shape', 'img_shape',
239
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
240
+ 'empty', 'refer_target_index', 'tokenized_words'
241
+ ])
242
+ ],
243
+ word_emb_cfg=dict(type='GloVe')))
244
+ ema = False
245
+ ema_factor = 0.999
246
+ use_fp16 = False
247
+ seed = 6666
248
+ deterministic = True
249
+ log_level = 'INFO'
250
+ log_interval = 50
251
+ save_interval = -1
252
+ resume_from = None
253
+ load_from = 'work_dir/gres/PropVG-grefcoco.pth'
254
+ finetune_from = None
255
+ evaluate_interval = 1
256
+ start_evaluate_epoch = 0
257
+ start_save_checkpoint = 7
258
+ max_token = 50
259
+ img_size = 320
260
+ patch_size = 16
261
+ model = dict(
262
+ type='MIXGrefUniModel_OMG',
263
+ vis_enc=dict(
264
+ type='BEIT3',
265
+ img_size=320,
266
+ patch_size=16,
267
+ vit_type='base',
268
+ drop_path_rate=0.1,
269
+ vocab_size=64010,
270
+ freeze_layer=-1,
271
+ vision_embed_proj_interpolate=False,
272
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
273
+ lan_enc=None,
274
+ fusion=None,
275
+ head=dict(
276
+ type='GTMHead',
277
+ input_channels=768,
278
+ hidden_channels=256,
279
+ num_queries=20,
280
+ detr_loss=dict(
281
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
282
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
283
+ loss_weight=dict(
284
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
285
+ bbox=0.1,
286
+ allbbox=0.1,
287
+ refer=1.0),
288
+ MTD=dict(K=250)),
289
+ post_params=dict(
290
+ score_weighted=False,
291
+ mask_threshold=0.5,
292
+ score_threshold=0.7,
293
+ with_nms=False,
294
+ with_mask=True),
295
+ process_visual=False,
296
+ visualize_params=dict(row_columns=(4, 5)),
297
+ visual_mode='test')
298
+ grad_norm_clip = 0.15
299
+ lr = 0.0005
300
+ optimizer_config = dict(
301
+ type='Adam',
302
+ lr=0.0005,
303
+ lr_vis_enc=5e-05,
304
+ lr_lan_enc=0.0005,
305
+ betas=(0.9, 0.98),
306
+ eps=1e-09,
307
+ weight_decay=0,
308
+ amsgrad=True)
309
+ scheduler_config = dict(
310
+ type='MultiStepLRWarmUp',
311
+ warmup_epochs=1,
312
+ decay_steps=[7, 11],
313
+ decay_ratio=0.1,
314
+ max_epoch=12)
315
+ launcher = 'pytorch'
316
+ distributed = True
317
+ rank = 0
318
+ world_size = 4
319
+
320
+ 2025-07-07 10:57:25,861 - PropVG - INFO - GRefCOCO-val size: 16870
321
+ 2025-07-07 10:57:37,626 - PropVG - INFO - GRefCOCO-testA size: 18712
322
+ 2025-07-07 10:57:49,703 - PropVG - INFO - GRefCOCO-testB size: 14933
323
+ 2025-07-07 10:57:55,300 - PropVG - INFO - loaded checkpoint from work_dir/gres/PropVG-grefcoco.pth
324
+
325
+ 2025-07-07 10:57:55,323 - PropVG - INFO - PropVG - evaluating set val
326
+ 2025-07-07 10:59:51,470 - PropVG - INFO - ------------ validate ------------ time: 116.14, F1score: 72.16, Nacc: 72.83, Tacc: 96.93, gIoU: 73.29, cIoU: 69.23, MaskACC@0.7-0.9: [74.74, 60.99, 23.42
327
+ 2025-07-07 10:59:52,918 - PropVG - INFO - PropVG - evaluating set testA
328
+ 2025-07-07 11:01:57,887 - PropVG - INFO - ------------ validate ------------ time: 124.96, F1score: 68.77, Nacc: 69.87, Tacc: 96.56, gIoU: 74.43, cIoU: 74.20, MaskACC@0.7-0.9: [77.48, 65.93, 30.06
329
+ 2025-07-07 11:01:59,563 - PropVG - INFO - PropVG - evaluating set testB
330
+ 2025-07-07 11:03:41,160 - PropVG - INFO - ------------ validate ------------ time: 101.59, F1score: 59.02, Nacc: 64.97, Tacc: 91.68, gIoU: 65.87, cIoU: 64.76, MaskACC@0.7-0.9: [62.03, 51.61, 28.43
331
+ 2025-07-07 11:03:42,844 - PropVG - INFO - sucessfully save the results to work_dir/gres/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx !!!
refcoco+/PropVG-refcoco+.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15fdea912cd5ac8722ff2a12954cc621ff21bea2e19bae900191f63419ea335e
3
+ size 987633701
refcoco+/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED
Binary file (5.2 kB). View file
 
refcoco+/test_log.txt ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:09:02,802 - PropVG - INFO - dataset = 'RefCOCOPlusUNC'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFile_TO',
8
+ max_token=20,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='RefCOCOPlusUNC',
12
+ use_token_type='beit3',
13
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFile_TO',
37
+ max_token=20,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='RefCOCOPlusUNC',
41
+ use_token_type='beit3',
42
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFile_TO',
66
+ max_token=20,
67
+ with_mask=True,
68
+ with_bbox=True,
69
+ dataset='RefCOCOPlusUNC',
70
+ use_token_type='beit3',
71
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
72
+ object_area_filter=100,
73
+ object_area_rate_filter=[0.05, 0.8]),
74
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
75
+ dict(
76
+ type='Normalize',
77
+ mean=[123.675, 116.28, 103.53],
78
+ std=[58.395, 57.12, 57.375]),
79
+ dict(type='DefaultFormatBundle'),
80
+ dict(
81
+ type='CollectData',
82
+ keys=[
83
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
84
+ 'gt_bbox'
85
+ ],
86
+ meta_keys=[
87
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
88
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
89
+ 'refer_target_index'
90
+ ])
91
+ ]
92
+ word_emb_cfg = dict(type='GloVe')
93
+ data = dict(
94
+ samples_per_gpu=8,
95
+ workers_per_gpu=4,
96
+ train=dict(
97
+ type='RefCOCOPlusUNC',
98
+ which_set='train',
99
+ img_source=['coco'],
100
+ annsfile=
101
+ './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
102
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
103
+ pipeline=[
104
+ dict(
105
+ type='LoadImageAnnotationsFromFile_TO',
106
+ max_token=20,
107
+ with_mask=True,
108
+ with_bbox=True,
109
+ dataset='RefCOCOPlusUNC',
110
+ use_token_type='beit3',
111
+ refer_file=
112
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
113
+ object_area_filter=100,
114
+ object_area_rate_filter=[0.05, 0.8]),
115
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
116
+ dict(
117
+ type='Normalize',
118
+ mean=[123.675, 116.28, 103.53],
119
+ std=[58.395, 57.12, 57.375]),
120
+ dict(type='DefaultFormatBundle'),
121
+ dict(
122
+ type='CollectData',
123
+ keys=[
124
+ 'img', 'ref_expr_inds', 'text_attention_mask',
125
+ 'gt_mask_rle', 'gt_bbox'
126
+ ],
127
+ meta_keys=[
128
+ 'filename', 'expression', 'ori_shape', 'img_shape',
129
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
130
+ 'empty', 'refer_target_index'
131
+ ])
132
+ ],
133
+ word_emb_cfg=dict(type='GloVe')),
134
+ val=dict(
135
+ type='RefCOCOPlusUNC',
136
+ which_set='val',
137
+ img_source=['coco'],
138
+ annsfile=
139
+ './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
140
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
141
+ pipeline=[
142
+ dict(
143
+ type='LoadImageAnnotationsFromFile_TO',
144
+ max_token=20,
145
+ with_mask=True,
146
+ with_bbox=True,
147
+ dataset='RefCOCOPlusUNC',
148
+ use_token_type='beit3',
149
+ refer_file=
150
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
151
+ object_area_filter=100,
152
+ object_area_rate_filter=[0.05, 0.8]),
153
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
154
+ dict(
155
+ type='Normalize',
156
+ mean=[123.675, 116.28, 103.53],
157
+ std=[58.395, 57.12, 57.375]),
158
+ dict(type='DefaultFormatBundle'),
159
+ dict(
160
+ type='CollectData',
161
+ keys=[
162
+ 'img', 'ref_expr_inds', 'text_attention_mask',
163
+ 'gt_mask_rle', 'gt_bbox'
164
+ ],
165
+ meta_keys=[
166
+ 'filename', 'expression', 'ori_shape', 'img_shape',
167
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
168
+ 'empty', 'refer_target_index'
169
+ ])
170
+ ],
171
+ word_emb_cfg=dict(type='GloVe')),
172
+ testA=dict(
173
+ type='RefCOCOPlusUNC',
174
+ which_set='testA',
175
+ img_source=['coco'],
176
+ annsfile=
177
+ './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
178
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
179
+ pipeline=[
180
+ dict(
181
+ type='LoadImageAnnotationsFromFile_TO',
182
+ max_token=20,
183
+ with_mask=True,
184
+ with_bbox=True,
185
+ dataset='RefCOCOPlusUNC',
186
+ use_token_type='beit3',
187
+ refer_file=
188
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
189
+ object_area_filter=100,
190
+ object_area_rate_filter=[0.05, 0.8]),
191
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
192
+ dict(
193
+ type='Normalize',
194
+ mean=[123.675, 116.28, 103.53],
195
+ std=[58.395, 57.12, 57.375]),
196
+ dict(type='DefaultFormatBundle'),
197
+ dict(
198
+ type='CollectData',
199
+ keys=[
200
+ 'img', 'ref_expr_inds', 'text_attention_mask',
201
+ 'gt_mask_rle', 'gt_bbox'
202
+ ],
203
+ meta_keys=[
204
+ 'filename', 'expression', 'ori_shape', 'img_shape',
205
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
206
+ 'empty', 'refer_target_index'
207
+ ])
208
+ ],
209
+ word_emb_cfg=dict(type='GloVe')),
210
+ testB=dict(
211
+ type='RefCOCOPlusUNC',
212
+ which_set='testB',
213
+ img_source=['coco'],
214
+ annsfile=
215
+ './data/seqtr_type/annotations/refcocoplus-unc/instances_withid.json',
216
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
217
+ pipeline=[
218
+ dict(
219
+ type='LoadImageAnnotationsFromFile_TO',
220
+ max_token=20,
221
+ with_mask=True,
222
+ with_bbox=True,
223
+ dataset='RefCOCOPlusUNC',
224
+ use_token_type='beit3',
225
+ refer_file=
226
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
227
+ object_area_filter=100,
228
+ object_area_rate_filter=[0.05, 0.8]),
229
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
230
+ dict(
231
+ type='Normalize',
232
+ mean=[123.675, 116.28, 103.53],
233
+ std=[58.395, 57.12, 57.375]),
234
+ dict(type='DefaultFormatBundle'),
235
+ dict(
236
+ type='CollectData',
237
+ keys=[
238
+ 'img', 'ref_expr_inds', 'text_attention_mask',
239
+ 'gt_mask_rle', 'gt_bbox'
240
+ ],
241
+ meta_keys=[
242
+ 'filename', 'expression', 'ori_shape', 'img_shape',
243
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
244
+ 'empty', 'refer_target_index'
245
+ ])
246
+ ],
247
+ word_emb_cfg=dict(type='GloVe')))
248
+ ema = False
249
+ ema_factor = 0.999
250
+ use_fp16 = False
251
+ seed = 6666
252
+ deterministic = True
253
+ log_level = 'INFO'
254
+ log_interval = 50
255
+ save_interval = -1
256
+ resume_from = None
257
+ load_from = 'work_dir/refcoco+/PropVG-refcoco+.pth'
258
+ finetune_from = None
259
+ evaluate_interval = 1
260
+ start_evaluate_epoch = 0
261
+ start_save_checkpoint = 20
262
+ max_token = 20
263
+ img_size = 384
264
+ patch_size = 16
265
+ model = dict(
266
+ type='MIXRefUniModel_OMG',
267
+ vis_enc=dict(
268
+ type='BEIT3',
269
+ img_size=384,
270
+ patch_size=16,
271
+ vit_type='base',
272
+ drop_path_rate=0.1,
273
+ vocab_size=64010,
274
+ freeze_layer=-1,
275
+ vision_embed_proj_interpolate=False,
276
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
277
+ lan_enc=None,
278
+ fusion=None,
279
+ head=dict(
280
+ type='REFHead',
281
+ input_channels=768,
282
+ hidden_channels=256,
283
+ num_queries=20,
284
+ detr_loss=dict(
285
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
286
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
287
+ loss_weight=dict(
288
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
289
+ bbox=0.1,
290
+ allbbox=0.1,
291
+ refer=1.0),
292
+ MTD=dict(K=100)),
293
+ post_params=dict(
294
+ score_weighted=False,
295
+ mask_threshold=0.5,
296
+ score_threshold=0.7,
297
+ with_nms=False,
298
+ with_mask=True),
299
+ process_visual=True,
300
+ visualize_params=dict(row_columns=(4, 5)),
301
+ visual_mode='test')
302
+ grad_norm_clip = 0.15
303
+ lr = 0.0005
304
+ optimizer_config = dict(
305
+ type='Adam',
306
+ lr=0.0005,
307
+ lr_vis_enc=5e-05,
308
+ lr_lan_enc=0.0005,
309
+ betas=(0.9, 0.98),
310
+ eps=1e-09,
311
+ weight_decay=0,
312
+ amsgrad=True)
313
+ scheduler_config = dict(
314
+ type='MultiStepLRWarmUp',
315
+ warmup_epochs=1,
316
+ decay_steps=[21, 27],
317
+ decay_ratio=0.1,
318
+ max_epoch=30)
319
+ launcher = 'pytorch'
320
+ distributed = True
321
+ rank = 0
322
+ world_size = 4
323
+
324
+ 2025-07-07 11:09:07,978 - PropVG - INFO - RefCOCOPlusUNC-val size: 10758
325
+ 2025-07-07 11:09:13,867 - PropVG - INFO - RefCOCOPlusUNC-testA size: 5726
326
+ 2025-07-07 11:09:19,990 - PropVG - INFO - RefCOCOPlusUNC-testB size: 4889
327
+ 2025-07-07 11:09:24,879 - PropVG - INFO - loaded checkpoint from work_dir/refcoco+/PropVG-refcoco+.pth
328
+
329
+ 2025-07-07 11:09:24,886 - PropVG - INFO - PropVG - evaluating set val
330
+ 2025-07-07 11:11:17,140 - PropVG - INFO - ------------ validate ------------ time: 112.25, DetACC: 83.73, mIoU: 72.94, oIoU: 70.24, MaskACC@0.5-0.9: [83.12, 80.60, 76.04, 65.37, 33.26]DetACC@0.5-0.9: [83.73, 81.30, 77.10, 68.58, 42.65]
331
+ 2025-07-07 11:11:18,910 - PropVG - INFO - PropVG - evaluating set testA
332
+ 2025-07-07 11:12:32,835 - PropVG - INFO - ------------ validate ------------ time: 73.92, DetACC: 88.01, mIoU: 76.49, oIoU: 74.32, MaskACC@0.5-0.9: [88.04, 86.00, 81.37, 70.53, 33.52]DetACC@0.5-0.9: [88.01, 85.91, 82.12, 73.80, 47.14]
333
+ 2025-07-07 11:12:34,541 - PropVG - INFO - PropVG - evaluating set testB
334
+ 2025-07-07 11:13:39,576 - PropVG - INFO - ------------ validate ------------ time: 65.03, DetACC: 76.59, mIoU: 67.21, oIoU: 63.41, MaskACC@0.5-0.9: [75.57, 71.83, 66.95, 57.38, 33.87]DetACC@0.5-0.9: [76.59, 73.26, 68.11, 59.24, 36.12]
335
+ 2025-07-07 11:13:41,507 - PropVG - INFO - sucessfully save the results to work_dir/refcoco+/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!
refcoco-mix/PropVG-refcoco-mix.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78ae3b32e6ea3c4bbfc84faaa46e50f80c3076175d2b8c346497e19bcd2fffd9
3
+ size 987636053
refcoco-mix/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED
Binary file (5.55 kB). View file
 
refcoco-mix/test_log.txt ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:27:50,676 - PropVG - INFO - dataset = 'MixedSeg'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFile_TO',
8
+ max_token=20,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='MixedSeg',
12
+ use_token_type='beit3',
13
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFile_TO',
37
+ max_token=20,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='MixedSeg',
41
+ use_token_type='beit3',
42
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFile_TO',
66
+ max_token=20,
67
+ with_mask=True,
68
+ with_bbox=True,
69
+ dataset='MixedSeg',
70
+ use_token_type='beit3',
71
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
72
+ object_area_filter=100,
73
+ object_area_rate_filter=[0.05, 0.8]),
74
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
75
+ dict(
76
+ type='Normalize',
77
+ mean=[123.675, 116.28, 103.53],
78
+ std=[58.395, 57.12, 57.375]),
79
+ dict(type='DefaultFormatBundle'),
80
+ dict(
81
+ type='CollectData',
82
+ keys=[
83
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
84
+ 'gt_bbox'
85
+ ],
86
+ meta_keys=[
87
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
88
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
89
+ 'refer_target_index'
90
+ ])
91
+ ]
92
+ word_emb_cfg = dict(type='GloVe')
93
+ data = dict(
94
+ samples_per_gpu=8,
95
+ workers_per_gpu=4,
96
+ train=dict(
97
+ type='MixedSeg',
98
+ which_set='train',
99
+ img_source=['coco'],
100
+ annsfile=
101
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
102
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
103
+ pipeline=[
104
+ dict(
105
+ type='LoadImageAnnotationsFromFile_TO',
106
+ max_token=20,
107
+ with_mask=True,
108
+ with_bbox=True,
109
+ dataset='MixedSeg',
110
+ use_token_type='beit3',
111
+ refer_file=
112
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
113
+ object_area_filter=100,
114
+ object_area_rate_filter=[0.05, 0.8]),
115
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
116
+ dict(
117
+ type='Normalize',
118
+ mean=[123.675, 116.28, 103.53],
119
+ std=[58.395, 57.12, 57.375]),
120
+ dict(type='DefaultFormatBundle'),
121
+ dict(
122
+ type='CollectData',
123
+ keys=[
124
+ 'img', 'ref_expr_inds', 'text_attention_mask',
125
+ 'gt_mask_rle', 'gt_bbox'
126
+ ],
127
+ meta_keys=[
128
+ 'filename', 'expression', 'ori_shape', 'img_shape',
129
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
130
+ 'empty', 'refer_target_index'
131
+ ])
132
+ ],
133
+ word_emb_cfg=dict(type='GloVe')),
134
+ val_refcoco_unc=dict(
135
+ type='MixedSeg',
136
+ which_set='val_refcoco_unc',
137
+ img_source=['coco'],
138
+ annsfile=
139
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
140
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
141
+ pipeline=[
142
+ dict(
143
+ type='LoadImageAnnotationsFromFile_TO',
144
+ max_token=20,
145
+ with_mask=True,
146
+ with_bbox=True,
147
+ dataset='MixedSeg',
148
+ use_token_type='beit3',
149
+ refer_file=
150
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
151
+ object_area_filter=100,
152
+ object_area_rate_filter=[0.05, 0.8]),
153
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
154
+ dict(
155
+ type='Normalize',
156
+ mean=[123.675, 116.28, 103.53],
157
+ std=[58.395, 57.12, 57.375]),
158
+ dict(type='DefaultFormatBundle'),
159
+ dict(
160
+ type='CollectData',
161
+ keys=[
162
+ 'img', 'ref_expr_inds', 'text_attention_mask',
163
+ 'gt_mask_rle', 'gt_bbox'
164
+ ],
165
+ meta_keys=[
166
+ 'filename', 'expression', 'ori_shape', 'img_shape',
167
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
168
+ 'empty', 'refer_target_index'
169
+ ])
170
+ ],
171
+ word_emb_cfg=dict(type='GloVe')),
172
+ testA_refcoco_unc=dict(
173
+ type='MixedSeg',
174
+ which_set='testA_refcoco_unc',
175
+ img_source=['coco'],
176
+ annsfile=
177
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
178
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
179
+ pipeline=[
180
+ dict(
181
+ type='LoadImageAnnotationsFromFile_TO',
182
+ max_token=20,
183
+ with_mask=True,
184
+ with_bbox=True,
185
+ dataset='MixedSeg',
186
+ use_token_type='beit3',
187
+ refer_file=
188
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
189
+ object_area_filter=100,
190
+ object_area_rate_filter=[0.05, 0.8]),
191
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
192
+ dict(
193
+ type='Normalize',
194
+ mean=[123.675, 116.28, 103.53],
195
+ std=[58.395, 57.12, 57.375]),
196
+ dict(type='DefaultFormatBundle'),
197
+ dict(
198
+ type='CollectData',
199
+ keys=[
200
+ 'img', 'ref_expr_inds', 'text_attention_mask',
201
+ 'gt_mask_rle', 'gt_bbox'
202
+ ],
203
+ meta_keys=[
204
+ 'filename', 'expression', 'ori_shape', 'img_shape',
205
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
206
+ 'empty', 'refer_target_index'
207
+ ])
208
+ ],
209
+ word_emb_cfg=dict(type='GloVe')),
210
+ testB_refcoco_unc=dict(
211
+ type='MixedSeg',
212
+ which_set='testB_refcoco_unc',
213
+ img_source=['coco'],
214
+ annsfile=
215
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
216
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
217
+ pipeline=[
218
+ dict(
219
+ type='LoadImageAnnotationsFromFile_TO',
220
+ max_token=20,
221
+ with_mask=True,
222
+ with_bbox=True,
223
+ dataset='MixedSeg',
224
+ use_token_type='beit3',
225
+ refer_file=
226
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
227
+ object_area_filter=100,
228
+ object_area_rate_filter=[0.05, 0.8]),
229
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
230
+ dict(
231
+ type='Normalize',
232
+ mean=[123.675, 116.28, 103.53],
233
+ std=[58.395, 57.12, 57.375]),
234
+ dict(type='DefaultFormatBundle'),
235
+ dict(
236
+ type='CollectData',
237
+ keys=[
238
+ 'img', 'ref_expr_inds', 'text_attention_mask',
239
+ 'gt_mask_rle', 'gt_bbox'
240
+ ],
241
+ meta_keys=[
242
+ 'filename', 'expression', 'ori_shape', 'img_shape',
243
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
244
+ 'empty', 'refer_target_index'
245
+ ])
246
+ ],
247
+ word_emb_cfg=dict(type='GloVe')),
248
+ val_refcocoplus_unc=dict(
249
+ type='MixedSeg',
250
+ which_set='val_refcocoplus_unc',
251
+ img_source=['coco'],
252
+ annsfile=
253
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
254
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
255
+ pipeline=[
256
+ dict(
257
+ type='LoadImageAnnotationsFromFile_TO',
258
+ max_token=20,
259
+ with_mask=True,
260
+ with_bbox=True,
261
+ dataset='MixedSeg',
262
+ use_token_type='beit3',
263
+ refer_file=
264
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
265
+ object_area_filter=100,
266
+ object_area_rate_filter=[0.05, 0.8]),
267
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
268
+ dict(
269
+ type='Normalize',
270
+ mean=[123.675, 116.28, 103.53],
271
+ std=[58.395, 57.12, 57.375]),
272
+ dict(type='DefaultFormatBundle'),
273
+ dict(
274
+ type='CollectData',
275
+ keys=[
276
+ 'img', 'ref_expr_inds', 'text_attention_mask',
277
+ 'gt_mask_rle', 'gt_bbox'
278
+ ],
279
+ meta_keys=[
280
+ 'filename', 'expression', 'ori_shape', 'img_shape',
281
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
282
+ 'empty', 'refer_target_index'
283
+ ])
284
+ ],
285
+ word_emb_cfg=dict(type='GloVe')),
286
+ testA_refcocoplus_unc=dict(
287
+ type='MixedSeg',
288
+ which_set='testA_refcocoplus_unc',
289
+ img_source=['coco'],
290
+ annsfile=
291
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
292
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
293
+ pipeline=[
294
+ dict(
295
+ type='LoadImageAnnotationsFromFile_TO',
296
+ max_token=20,
297
+ with_mask=True,
298
+ with_bbox=True,
299
+ dataset='MixedSeg',
300
+ use_token_type='beit3',
301
+ refer_file=
302
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
303
+ object_area_filter=100,
304
+ object_area_rate_filter=[0.05, 0.8]),
305
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
306
+ dict(
307
+ type='Normalize',
308
+ mean=[123.675, 116.28, 103.53],
309
+ std=[58.395, 57.12, 57.375]),
310
+ dict(type='DefaultFormatBundle'),
311
+ dict(
312
+ type='CollectData',
313
+ keys=[
314
+ 'img', 'ref_expr_inds', 'text_attention_mask',
315
+ 'gt_mask_rle', 'gt_bbox'
316
+ ],
317
+ meta_keys=[
318
+ 'filename', 'expression', 'ori_shape', 'img_shape',
319
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
320
+ 'empty', 'refer_target_index'
321
+ ])
322
+ ],
323
+ word_emb_cfg=dict(type='GloVe')),
324
+ testB_refcocoplus_unc=dict(
325
+ type='MixedSeg',
326
+ which_set='testB_refcocoplus_unc',
327
+ img_source=['coco'],
328
+ annsfile=
329
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
330
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
331
+ pipeline=[
332
+ dict(
333
+ type='LoadImageAnnotationsFromFile_TO',
334
+ max_token=20,
335
+ with_mask=True,
336
+ with_bbox=True,
337
+ dataset='MixedSeg',
338
+ use_token_type='beit3',
339
+ refer_file=
340
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
341
+ object_area_filter=100,
342
+ object_area_rate_filter=[0.05, 0.8]),
343
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
344
+ dict(
345
+ type='Normalize',
346
+ mean=[123.675, 116.28, 103.53],
347
+ std=[58.395, 57.12, 57.375]),
348
+ dict(type='DefaultFormatBundle'),
349
+ dict(
350
+ type='CollectData',
351
+ keys=[
352
+ 'img', 'ref_expr_inds', 'text_attention_mask',
353
+ 'gt_mask_rle', 'gt_bbox'
354
+ ],
355
+ meta_keys=[
356
+ 'filename', 'expression', 'ori_shape', 'img_shape',
357
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
358
+ 'empty', 'refer_target_index'
359
+ ])
360
+ ],
361
+ word_emb_cfg=dict(type='GloVe')),
362
+ val_refcocog_umd=dict(
363
+ type='MixedSeg',
364
+ which_set='val_refcocog_umd',
365
+ img_source=['coco'],
366
+ annsfile=
367
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
368
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
369
+ pipeline=[
370
+ dict(
371
+ type='LoadImageAnnotationsFromFile_TO',
372
+ max_token=20,
373
+ with_mask=True,
374
+ with_bbox=True,
375
+ dataset='MixedSeg',
376
+ use_token_type='beit3',
377
+ refer_file=
378
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
379
+ object_area_filter=100,
380
+ object_area_rate_filter=[0.05, 0.8]),
381
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
382
+ dict(
383
+ type='Normalize',
384
+ mean=[123.675, 116.28, 103.53],
385
+ std=[58.395, 57.12, 57.375]),
386
+ dict(type='DefaultFormatBundle'),
387
+ dict(
388
+ type='CollectData',
389
+ keys=[
390
+ 'img', 'ref_expr_inds', 'text_attention_mask',
391
+ 'gt_mask_rle', 'gt_bbox'
392
+ ],
393
+ meta_keys=[
394
+ 'filename', 'expression', 'ori_shape', 'img_shape',
395
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
396
+ 'empty', 'refer_target_index'
397
+ ])
398
+ ],
399
+ word_emb_cfg=dict(type='GloVe')),
400
+ test_refcocog_umd=dict(
401
+ type='MixedSeg',
402
+ which_set='test_refcocog_umd',
403
+ img_source=['coco'],
404
+ annsfile=
405
+ './data/seqtr_type/annotations/mixed-seg/instances_nogoogle_withid.json',
406
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
407
+ pipeline=[
408
+ dict(
409
+ type='LoadImageAnnotationsFromFile_TO',
410
+ max_token=20,
411
+ with_mask=True,
412
+ with_bbox=True,
413
+ dataset='MixedSeg',
414
+ use_token_type='beit3',
415
+ refer_file=
416
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
417
+ object_area_filter=100,
418
+ object_area_rate_filter=[0.05, 0.8]),
419
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
420
+ dict(
421
+ type='Normalize',
422
+ mean=[123.675, 116.28, 103.53],
423
+ std=[58.395, 57.12, 57.375]),
424
+ dict(type='DefaultFormatBundle'),
425
+ dict(
426
+ type='CollectData',
427
+ keys=[
428
+ 'img', 'ref_expr_inds', 'text_attention_mask',
429
+ 'gt_mask_rle', 'gt_bbox'
430
+ ],
431
+ meta_keys=[
432
+ 'filename', 'expression', 'ori_shape', 'img_shape',
433
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
434
+ 'empty', 'refer_target_index'
435
+ ])
436
+ ],
437
+ word_emb_cfg=dict(type='GloVe')))
438
+ ema = False
439
+ ema_factor = 0.999
440
+ use_fp16 = False
441
+ seed = 6666
442
+ deterministic = True
443
+ log_level = 'INFO'
444
+ log_interval = 50
445
+ save_interval = -1
446
+ resume_from = None
447
+ load_from = 'work_dir/refcoco-mix/PropVG-refcoco-mix.pth'
448
+ finetune_from = None
449
+ evaluate_interval = 1
450
+ start_evaluate_epoch = 0
451
+ start_save_checkpoint = 20
452
+ max_token = 20
453
+ img_size = 384
454
+ patch_size = 16
455
+ model = dict(
456
+ type='MIXRefUniModel_OMG',
457
+ vis_enc=dict(
458
+ type='BEIT3',
459
+ img_size=384,
460
+ patch_size=16,
461
+ vit_type='base',
462
+ drop_path_rate=0.1,
463
+ vocab_size=64010,
464
+ freeze_layer=-1,
465
+ vision_embed_proj_interpolate=False,
466
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
467
+ lan_enc=None,
468
+ fusion=None,
469
+ head=dict(
470
+ type='REFHead',
471
+ input_channels=768,
472
+ hidden_channels=256,
473
+ num_queries=20,
474
+ detr_loss=dict(
475
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
476
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
477
+ loss_weight=dict(
478
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
479
+ bbox=0.1,
480
+ allbbox=0.1,
481
+ refer=1.0),
482
+ MTD=dict(K=100)),
483
+ post_params=dict(
484
+ score_weighted=False,
485
+ mask_threshold=0.5,
486
+ score_threshold=0.7,
487
+ with_nms=False,
488
+ with_mask=True),
489
+ process_visual=False,
490
+ visualize_params=dict(row_columns=(4, 5)),
491
+ visual_mode='test')
492
+ grad_norm_clip = 0.15
493
+ lr = 0.0005
494
+ optimizer_config = dict(
495
+ type='Adam',
496
+ lr=0.0005,
497
+ lr_vis_enc=5e-05,
498
+ lr_lan_enc=0.0005,
499
+ betas=(0.9, 0.98),
500
+ eps=1e-09,
501
+ weight_decay=0,
502
+ amsgrad=True)
503
+ scheduler_config = dict(
504
+ type='MultiStepLRWarmUp',
505
+ warmup_epochs=1,
506
+ decay_steps=[21, 27],
507
+ decay_ratio=0.1,
508
+ max_epoch=30)
509
+ launcher = 'pytorch'
510
+ distributed = True
511
+ rank = 0
512
+ world_size = 1
513
+
514
+ 2025-07-07 11:27:58,403 - PropVG - INFO - Mixed-val_refcoco_unc size: 10834
515
+ 2025-07-07 11:28:06,594 - PropVG - INFO - Mixed-testA_refcoco_unc size: 5657
516
+ 2025-07-07 11:28:15,164 - PropVG - INFO - Mixed-testB_refcoco_unc size: 5095
517
+ 2025-07-07 11:28:23,677 - PropVG - INFO - Mixed-val_refcocoplus_unc size: 10758
518
+ 2025-07-07 11:28:30,907 - PropVG - INFO - Mixed-testA_refcocoplus_unc size: 5726
519
+ 2025-07-07 11:28:38,494 - PropVG - INFO - Mixed-testB_refcocoplus_unc size: 4889
520
+ 2025-07-07 11:28:49,090 - PropVG - INFO - Mixed-val_refcocog_umd size: 4896
521
+ 2025-07-07 11:28:54,576 - PropVG - INFO - Mixed-test_refcocog_umd size: 9602
522
+ 2025-07-07 11:29:02,664 - PropVG - INFO - loaded checkpoint from work_dir/refcoco-mix/PropVG-refcoco-mix.pth
523
+
524
+ 2025-07-07 11:29:02,665 - PropVG - INFO - PropVG - evaluating set val_refcoco_unc
525
+ 2025-07-07 11:32:39,213 - PropVG - INFO - ------------ validate ------------ time: 216.54, DetACC: 92.70, mIoU: 81.96, oIoU: 81.80, MaskACC@0.5-0.9: [92.24, 90.71, 87.59, 79.79, 46.59]DetACC@0.5-0.9: [92.70, 91.43, 88.90, 83.85, 66.30]
526
+ 2025-07-07 11:32:43,474 - PropVG - INFO - PropVG - evaluating set testA_refcoco_unc
527
+ 2025-07-07 11:34:47,838 - PropVG - INFO - ------------ validate ------------ time: 124.36, DetACC: 95.07, mIoU: 83.58, oIoU: 83.74, MaskACC@0.5-0.9: [94.56, 93.48, 90.93, 82.91, 46.61]DetACC@0.5-0.9: [95.07, 93.99, 92.17, 88.17, 69.29]
528
+ 2025-07-07 11:34:53,297 - PropVG - INFO - PropVG - evaluating set testB_refcoco_unc
529
+ 2025-07-07 11:36:51,290 - PropVG - INFO - ------------ validate ------------ time: 117.99, DetACC: 89.58, mIoU: 80.02, oIoU: 79.33, MaskACC@0.5-0.9: [89.19, 86.99, 83.45, 76.76, 51.07]DetACC@0.5-0.9: [89.58, 87.56, 84.61, 79.14, 61.83]
530
+ 2025-07-07 11:36:56,652 - PropVG - INFO - PropVG - evaluating set val_refcocoplus_unc
531
+ 2025-07-07 11:40:28,540 - PropVG - INFO - ------------ validate ------------ time: 211.88, DetACC: 87.27, mIoU: 77.14, oIoU: 74.81, MaskACC@0.5-0.9: [86.67, 85.36, 82.52, 75.28, 44.34]DetACC@0.5-0.9: [87.27, 86.30, 84.09, 79.64, 63.62]
532
+ 2025-07-07 11:40:36,392 - PropVG - INFO - PropVG - evaluating set testA_refcocoplus_unc
533
+ 2025-07-07 11:42:43,800 - PropVG - INFO - ------------ validate ------------ time: 127.40, DetACC: 90.87, mIoU: 79.83, oIoU: 78.72, MaskACC@0.5-0.9: [90.13, 88.79, 86.57, 79.46, 45.04]DetACC@0.5-0.9: [90.87, 89.82, 87.81, 83.92, 66.33]
534
+ 2025-07-07 11:42:48,169 - PropVG - INFO - PropVG - evaluating set testB_refcocoplus_unc
535
+ 2025-07-07 11:44:41,261 - PropVG - INFO - ------------ validate ------------ time: 113.09, DetACC: 81.26, mIoU: 72.18, oIoU: 69.15, MaskACC@0.5-0.9: [80.18, 78.20, 74.78, 68.68, 45.88]DetACC@0.5-0.9: [81.26, 79.40, 76.95, 72.20, 56.78]
536
+ 2025-07-07 11:44:45,751 - PropVG - INFO - PropVG - evaluating set val_refcocog_umd
537
+ 2025-07-07 11:46:42,173 - PropVG - INFO - ------------ validate ------------ time: 116.42, DetACC: 88.15, mIoU: 76.97, oIoU: 75.54, MaskACC@0.5-0.9: [86.17, 83.58, 79.43, 72.16, 44.87]DetACC@0.5-0.9: [88.15, 85.97, 82.90, 78.00, 63.09]
538
+ 2025-07-07 11:46:46,257 - PropVG - INFO - PropVG - evaluating set test_refcocog_umd
539
+ 2025-07-07 11:50:06,821 - PropVG - INFO - ------------ validate ------------ time: 200.56, DetACC: 88.30, mIoU: 77.72, oIoU: 77.40, MaskACC@0.5-0.9: [87.14, 85.01, 80.84, 72.78, 45.79]DetACC@0.5-0.9: [88.30, 86.71, 83.98, 79.07, 65.00]
540
+ 2025-07-07 11:50:11,168 - PropVG - INFO - sucessfully save the results to work_dir/refcoco-mix/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!
refcoco/PropVG-refcoco.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd95a1952b4ac17c234432463e39e3eca42802ffdbbffcc787ea2034c5f1ac5b
3
+ size 987632917
refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED
Binary file (5.19 kB). View file
 
refcoco/test_log.txt ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:04:05,796 - PropVG - INFO - dataset = 'RefCOCOUNC'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFile_TO',
8
+ max_token=20,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='RefCOCOUNC',
12
+ use_token_type='beit3',
13
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFile_TO',
37
+ max_token=20,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='RefCOCOUNC',
41
+ use_token_type='beit3',
42
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFile_TO',
66
+ max_token=20,
67
+ with_mask=True,
68
+ with_bbox=True,
69
+ dataset='RefCOCOUNC',
70
+ use_token_type='beit3',
71
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
72
+ object_area_filter=100,
73
+ object_area_rate_filter=[0.05, 0.8]),
74
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
75
+ dict(
76
+ type='Normalize',
77
+ mean=[123.675, 116.28, 103.53],
78
+ std=[58.395, 57.12, 57.375]),
79
+ dict(type='DefaultFormatBundle'),
80
+ dict(
81
+ type='CollectData',
82
+ keys=[
83
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
84
+ 'gt_bbox'
85
+ ],
86
+ meta_keys=[
87
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
88
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
89
+ 'refer_target_index'
90
+ ])
91
+ ]
92
+ word_emb_cfg = dict(type='GloVe')
93
+ data = dict(
94
+ samples_per_gpu=8,
95
+ workers_per_gpu=4,
96
+ train=dict(
97
+ type='RefCOCOUNC',
98
+ which_set='train',
99
+ img_source=['coco'],
100
+ annsfile=
101
+ './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
102
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
103
+ pipeline=[
104
+ dict(
105
+ type='LoadImageAnnotationsFromFile_TO',
106
+ max_token=20,
107
+ with_mask=True,
108
+ with_bbox=True,
109
+ dataset='RefCOCOUNC',
110
+ use_token_type='beit3',
111
+ refer_file=
112
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
113
+ object_area_filter=100,
114
+ object_area_rate_filter=[0.05, 0.8]),
115
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
116
+ dict(
117
+ type='Normalize',
118
+ mean=[123.675, 116.28, 103.53],
119
+ std=[58.395, 57.12, 57.375]),
120
+ dict(type='DefaultFormatBundle'),
121
+ dict(
122
+ type='CollectData',
123
+ keys=[
124
+ 'img', 'ref_expr_inds', 'text_attention_mask',
125
+ 'gt_mask_rle', 'gt_bbox'
126
+ ],
127
+ meta_keys=[
128
+ 'filename', 'expression', 'ori_shape', 'img_shape',
129
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
130
+ 'empty', 'refer_target_index'
131
+ ])
132
+ ],
133
+ word_emb_cfg=dict(type='GloVe')),
134
+ val=dict(
135
+ type='RefCOCOUNC',
136
+ which_set='val',
137
+ img_source=['coco'],
138
+ annsfile=
139
+ './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
140
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
141
+ pipeline=[
142
+ dict(
143
+ type='LoadImageAnnotationsFromFile_TO',
144
+ max_token=20,
145
+ with_mask=True,
146
+ with_bbox=True,
147
+ dataset='RefCOCOUNC',
148
+ use_token_type='beit3',
149
+ refer_file=
150
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
151
+ object_area_filter=100,
152
+ object_area_rate_filter=[0.05, 0.8]),
153
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
154
+ dict(
155
+ type='Normalize',
156
+ mean=[123.675, 116.28, 103.53],
157
+ std=[58.395, 57.12, 57.375]),
158
+ dict(type='DefaultFormatBundle'),
159
+ dict(
160
+ type='CollectData',
161
+ keys=[
162
+ 'img', 'ref_expr_inds', 'text_attention_mask',
163
+ 'gt_mask_rle', 'gt_bbox'
164
+ ],
165
+ meta_keys=[
166
+ 'filename', 'expression', 'ori_shape', 'img_shape',
167
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
168
+ 'empty', 'refer_target_index'
169
+ ])
170
+ ],
171
+ word_emb_cfg=dict(type='GloVe')),
172
+ testA=dict(
173
+ type='RefCOCOUNC',
174
+ which_set='testA',
175
+ img_source=['coco'],
176
+ annsfile=
177
+ './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
178
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
179
+ pipeline=[
180
+ dict(
181
+ type='LoadImageAnnotationsFromFile_TO',
182
+ max_token=20,
183
+ with_mask=True,
184
+ with_bbox=True,
185
+ dataset='RefCOCOUNC',
186
+ use_token_type='beit3',
187
+ refer_file=
188
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
189
+ object_area_filter=100,
190
+ object_area_rate_filter=[0.05, 0.8]),
191
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
192
+ dict(
193
+ type='Normalize',
194
+ mean=[123.675, 116.28, 103.53],
195
+ std=[58.395, 57.12, 57.375]),
196
+ dict(type='DefaultFormatBundle'),
197
+ dict(
198
+ type='CollectData',
199
+ keys=[
200
+ 'img', 'ref_expr_inds', 'text_attention_mask',
201
+ 'gt_mask_rle', 'gt_bbox'
202
+ ],
203
+ meta_keys=[
204
+ 'filename', 'expression', 'ori_shape', 'img_shape',
205
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
206
+ 'empty', 'refer_target_index'
207
+ ])
208
+ ],
209
+ word_emb_cfg=dict(type='GloVe')),
210
+ testB=dict(
211
+ type='RefCOCOUNC',
212
+ which_set='testB',
213
+ img_source=['coco'],
214
+ annsfile=
215
+ './data/seqtr_type/annotations/refcoco-unc/instances_withid.json',
216
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
217
+ pipeline=[
218
+ dict(
219
+ type='LoadImageAnnotationsFromFile_TO',
220
+ max_token=20,
221
+ with_mask=True,
222
+ with_bbox=True,
223
+ dataset='RefCOCOUNC',
224
+ use_token_type='beit3',
225
+ refer_file=
226
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
227
+ object_area_filter=100,
228
+ object_area_rate_filter=[0.05, 0.8]),
229
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
230
+ dict(
231
+ type='Normalize',
232
+ mean=[123.675, 116.28, 103.53],
233
+ std=[58.395, 57.12, 57.375]),
234
+ dict(type='DefaultFormatBundle'),
235
+ dict(
236
+ type='CollectData',
237
+ keys=[
238
+ 'img', 'ref_expr_inds', 'text_attention_mask',
239
+ 'gt_mask_rle', 'gt_bbox'
240
+ ],
241
+ meta_keys=[
242
+ 'filename', 'expression', 'ori_shape', 'img_shape',
243
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
244
+ 'empty', 'refer_target_index'
245
+ ])
246
+ ],
247
+ word_emb_cfg=dict(type='GloVe')))
248
+ ema = False
249
+ ema_factor = 0.999
250
+ use_fp16 = False
251
+ seed = 6666
252
+ deterministic = True
253
+ log_level = 'INFO'
254
+ log_interval = 50
255
+ save_interval = -1
256
+ resume_from = None
257
+ load_from = 'work_dir/refcoco/PropVG-refcoco.pth'
258
+ finetune_from = None
259
+ evaluate_interval = 1
260
+ start_evaluate_epoch = 0
261
+ start_save_checkpoint = 20
262
+ max_token = 20
263
+ img_size = 384
264
+ patch_size = 16
265
+ model = dict(
266
+ type='MIXRefUniModel_OMG',
267
+ vis_enc=dict(
268
+ type='BEIT3',
269
+ img_size=384,
270
+ patch_size=16,
271
+ vit_type='base',
272
+ drop_path_rate=0.1,
273
+ vocab_size=64010,
274
+ freeze_layer=-1,
275
+ vision_embed_proj_interpolate=False,
276
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
277
+ lan_enc=None,
278
+ fusion=None,
279
+ head=dict(
280
+ type='REFHead',
281
+ input_channels=768,
282
+ hidden_channels=256,
283
+ num_queries=20,
284
+ detr_loss=dict(
285
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
286
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
287
+ loss_weight=dict(
288
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
289
+ bbox=0.1,
290
+ allbbox=0.1,
291
+ refer=1.0),
292
+ MTD=dict(K=100)),
293
+ post_params=dict(
294
+ score_weighted=False,
295
+ mask_threshold=0.5,
296
+ score_threshold=0.7,
297
+ with_nms=False,
298
+ with_mask=True),
299
+ process_visual=True,
300
+ visualize_params=dict(row_columns=(4, 5)),
301
+ visual_mode='test')
302
+ grad_norm_clip = 0.15
303
+ lr = 0.0005
304
+ optimizer_config = dict(
305
+ type='Adam',
306
+ lr=0.0005,
307
+ lr_vis_enc=5e-05,
308
+ lr_lan_enc=0.0005,
309
+ betas=(0.9, 0.98),
310
+ eps=1e-09,
311
+ weight_decay=0,
312
+ amsgrad=True)
313
+ scheduler_config = dict(
314
+ type='MultiStepLRWarmUp',
315
+ warmup_epochs=1,
316
+ decay_steps=[21, 27],
317
+ decay_ratio=0.1,
318
+ max_epoch=30)
319
+ launcher = 'pytorch'
320
+ distributed = True
321
+ rank = 0
322
+ world_size = 4
323
+
324
+ 2025-07-07 11:04:11,542 - PropVG - INFO - RefCOCOUNC-val size: 10834
325
+ 2025-07-07 11:04:17,084 - PropVG - INFO - RefCOCOUNC-testA size: 5657
326
+ 2025-07-07 11:04:22,843 - PropVG - INFO - RefCOCOUNC-testB size: 5095
327
+ 2025-07-07 11:04:28,381 - PropVG - INFO - loaded checkpoint from work_dir/refcoco/PropVG-refcoco.pth
328
+
329
+ 2025-07-07 11:04:28,382 - PropVG - INFO - PropVG - evaluating set val
330
+ 2025-07-07 11:06:19,535 - PropVG - INFO - ------------ validate ------------ time: 111.15, DetACC: 88.95, mIoU: 77.98, oIoU: 76.79, MaskACC@0.5-0.9: [89.14, 86.33, 81.66, 70.75, 36.42]DetACC@0.5-0.9: [88.95, 86.66, 82.62, 73.65, 47.82]
331
+ 2025-07-07 11:06:21,284 - PropVG - INFO - PropVG - evaluating set testA
332
+ 2025-07-07 11:07:29,418 - PropVG - INFO - ------------ validate ------------ time: 68.13, DetACC: 91.55, mIoU: 79.81, oIoU: 79.57, MaskACC@0.5-0.9: [91.66, 89.84, 85.42, 73.96, 36.22]DetACC@0.5-0.9: [91.55, 89.95, 85.94, 77.69, 51.57]
333
+ 2025-07-07 11:07:30,844 - PropVG - INFO - PropVG - evaluating set testB
334
+ 2025-07-07 11:08:36,434 - PropVG - INFO - ------------ validate ------------ time: 65.59, DetACC: 85.73, mIoU: 75.28, oIoU: 73.68, MaskACC@0.5-0.9: [84.95, 81.26, 76.06, 65.64, 38.81]DetACC@0.5-0.9: [85.73, 82.06, 76.20, 66.88, 41.97]
335
+ 2025-07-07 11:08:37,918 - PropVG - INFO - sucessfully save the results to work_dir/refcoco/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!
refcocog/PropVG-refcocog.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:004695f51a341fea17b4e7a7ff1186ada40745ca5d794b92f1adb4f6f55e9b76
3
+ size 987633701
refcocog/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED
Binary file (5.12 kB). View file
 
refcocog/test_log.txt ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:14:04,236 - PropVG - INFO - dataset = 'RefCOCOgUMD'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFile_TO',
8
+ max_token=20,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='RefCOCOgUMD',
12
+ use_token_type='beit3',
13
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFile_TO',
37
+ max_token=20,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='RefCOCOgUMD',
41
+ use_token_type='beit3',
42
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFile_TO',
66
+ max_token=20,
67
+ with_mask=True,
68
+ with_bbox=True,
69
+ dataset='RefCOCOgUMD',
70
+ use_token_type='beit3',
71
+ refer_file='data/seqtr_type/annotations/mixed-seg/coco_all.json',
72
+ object_area_filter=100,
73
+ object_area_rate_filter=[0.05, 0.8]),
74
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
75
+ dict(
76
+ type='Normalize',
77
+ mean=[123.675, 116.28, 103.53],
78
+ std=[58.395, 57.12, 57.375]),
79
+ dict(type='DefaultFormatBundle'),
80
+ dict(
81
+ type='CollectData',
82
+ keys=[
83
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
84
+ 'gt_bbox'
85
+ ],
86
+ meta_keys=[
87
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
88
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
89
+ 'refer_target_index'
90
+ ])
91
+ ]
92
+ word_emb_cfg = dict(type='GloVe')
93
+ data = dict(
94
+ samples_per_gpu=8,
95
+ workers_per_gpu=4,
96
+ train=dict(
97
+ type='RefCOCOgUMD',
98
+ which_set='train',
99
+ img_source=['coco'],
100
+ annsfile=
101
+ './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
102
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
103
+ pipeline=[
104
+ dict(
105
+ type='LoadImageAnnotationsFromFile_TO',
106
+ max_token=20,
107
+ with_mask=True,
108
+ with_bbox=True,
109
+ dataset='RefCOCOgUMD',
110
+ use_token_type='beit3',
111
+ refer_file=
112
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
113
+ object_area_filter=100,
114
+ object_area_rate_filter=[0.05, 0.8]),
115
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
116
+ dict(
117
+ type='Normalize',
118
+ mean=[123.675, 116.28, 103.53],
119
+ std=[58.395, 57.12, 57.375]),
120
+ dict(type='DefaultFormatBundle'),
121
+ dict(
122
+ type='CollectData',
123
+ keys=[
124
+ 'img', 'ref_expr_inds', 'text_attention_mask',
125
+ 'gt_mask_rle', 'gt_bbox'
126
+ ],
127
+ meta_keys=[
128
+ 'filename', 'expression', 'ori_shape', 'img_shape',
129
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
130
+ 'empty', 'refer_target_index'
131
+ ])
132
+ ],
133
+ word_emb_cfg=dict(type='GloVe')),
134
+ val=dict(
135
+ type='RefCOCOgUMD',
136
+ which_set='val',
137
+ img_source=['coco'],
138
+ annsfile=
139
+ './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
140
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
141
+ pipeline=[
142
+ dict(
143
+ type='LoadImageAnnotationsFromFile_TO',
144
+ max_token=20,
145
+ with_mask=True,
146
+ with_bbox=True,
147
+ dataset='RefCOCOgUMD',
148
+ use_token_type='beit3',
149
+ refer_file=
150
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
151
+ object_area_filter=100,
152
+ object_area_rate_filter=[0.05, 0.8]),
153
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
154
+ dict(
155
+ type='Normalize',
156
+ mean=[123.675, 116.28, 103.53],
157
+ std=[58.395, 57.12, 57.375]),
158
+ dict(type='DefaultFormatBundle'),
159
+ dict(
160
+ type='CollectData',
161
+ keys=[
162
+ 'img', 'ref_expr_inds', 'text_attention_mask',
163
+ 'gt_mask_rle', 'gt_bbox'
164
+ ],
165
+ meta_keys=[
166
+ 'filename', 'expression', 'ori_shape', 'img_shape',
167
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
168
+ 'empty', 'refer_target_index'
169
+ ])
170
+ ],
171
+ word_emb_cfg=dict(type='GloVe')),
172
+ test=dict(
173
+ type='RefCOCOgUMD',
174
+ which_set='test',
175
+ img_source=['coco'],
176
+ annsfile=
177
+ './data/seqtr_type/annotations/refcocog-umd/instances_withid.json',
178
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
179
+ pipeline=[
180
+ dict(
181
+ type='LoadImageAnnotationsFromFile_TO',
182
+ max_token=20,
183
+ with_mask=True,
184
+ with_bbox=True,
185
+ dataset='RefCOCOgUMD',
186
+ use_token_type='beit3',
187
+ refer_file=
188
+ 'data/seqtr_type/annotations/mixed-seg/coco_all.json',
189
+ object_area_filter=100,
190
+ object_area_rate_filter=[0.05, 0.8]),
191
+ dict(type='Resize', img_scale=(384, 384), keep_ratio=False),
192
+ dict(
193
+ type='Normalize',
194
+ mean=[123.675, 116.28, 103.53],
195
+ std=[58.395, 57.12, 57.375]),
196
+ dict(type='DefaultFormatBundle'),
197
+ dict(
198
+ type='CollectData',
199
+ keys=[
200
+ 'img', 'ref_expr_inds', 'text_attention_mask',
201
+ 'gt_mask_rle', 'gt_bbox'
202
+ ],
203
+ meta_keys=[
204
+ 'filename', 'expression', 'ori_shape', 'img_shape',
205
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
206
+ 'empty', 'refer_target_index'
207
+ ])
208
+ ],
209
+ word_emb_cfg=dict(type='GloVe')))
210
+ ema = False
211
+ ema_factor = 0.999
212
+ use_fp16 = False
213
+ seed = 6666
214
+ deterministic = True
215
+ log_level = 'INFO'
216
+ log_interval = 50
217
+ save_interval = -1
218
+ resume_from = None
219
+ load_from = 'work_dir/refcocog/PropVG-refcocog.pth'
220
+ finetune_from = None
221
+ evaluate_interval = 1
222
+ start_evaluate_epoch = 0
223
+ start_save_checkpoint = 20
224
+ max_token = 20
225
+ img_size = 384
226
+ patch_size = 16
227
+ model = dict(
228
+ type='MIXRefUniModel_OMG',
229
+ vis_enc=dict(
230
+ type='BEIT3',
231
+ img_size=384,
232
+ patch_size=16,
233
+ vit_type='base',
234
+ drop_path_rate=0.1,
235
+ vocab_size=64010,
236
+ freeze_layer=-1,
237
+ vision_embed_proj_interpolate=False,
238
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
239
+ lan_enc=None,
240
+ fusion=None,
241
+ head=dict(
242
+ type='REFHead',
243
+ input_channels=768,
244
+ hidden_channels=256,
245
+ num_queries=20,
246
+ detr_loss=dict(
247
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
248
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
249
+ loss_weight=dict(
250
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
251
+ bbox=0.1,
252
+ allbbox=0.1,
253
+ refer=1.0),
254
+ MTD=dict(K=100)),
255
+ post_params=dict(
256
+ score_weighted=False,
257
+ mask_threshold=0.5,
258
+ score_threshold=0.7,
259
+ with_nms=False,
260
+ with_mask=True),
261
+ process_visual=True,
262
+ visualize_params=dict(row_columns=(4, 5)),
263
+ visual_mode='test')
264
+ grad_norm_clip = 0.15
265
+ lr = 0.0005
266
+ optimizer_config = dict(
267
+ type='Adam',
268
+ lr=0.0005,
269
+ lr_vis_enc=5e-05,
270
+ lr_lan_enc=0.0005,
271
+ betas=(0.9, 0.98),
272
+ eps=1e-09,
273
+ weight_decay=0,
274
+ amsgrad=True)
275
+ scheduler_config = dict(
276
+ type='MultiStepLRWarmUp',
277
+ warmup_epochs=1,
278
+ decay_steps=[21, 27],
279
+ decay_ratio=0.1,
280
+ max_epoch=30)
281
+ launcher = 'pytorch'
282
+ distributed = True
283
+ rank = 0
284
+ world_size = 1
285
+
286
+ 2025-07-07 11:14:09,303 - PropVG - INFO - RefCOCOg-val size: 4896
287
+ 2025-07-07 11:14:14,811 - PropVG - INFO - RefCOCOg-test size: 9602
288
+ 2025-07-07 11:14:19,468 - PropVG - INFO - loaded checkpoint from work_dir/refcocog/PropVG-refcocog.pth
289
+
290
+ 2025-07-07 11:14:19,479 - PropVG - INFO - PropVG - evaluating set val
291
+ 2025-07-07 11:16:13,025 - PropVG - INFO - ------------ validate ------------ time: 113.54, DetACC: 83.50, mIoU: 71.34, oIoU: 69.30, MaskACC@0.5-0.9: [81.19, 77.33, 71.51, 60.15, 30.78]DetACC@0.5-0.9: [83.50, 80.09, 75.41, 66.07, 40.54]
292
+ 2025-07-07 11:16:15,090 - PropVG - INFO - PropVG - evaluating set test
293
+ 2025-07-07 11:19:29,251 - PropVG - INFO - ------------ validate ------------ time: 194.16, DetACC: 84.44, mIoU: 72.10, oIoU: 70.53, MaskACC@0.5-0.9: [82.53, 78.47, 72.66, 61.23, 30.31]DetACC@0.5-0.9: [84.44, 81.32, 76.33, 67.14, 42.69]
294
+ 2025-07-07 11:19:31,176 - PropVG - INFO - sucessfully save the results to work_dir/refcocog/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!
refzom/PropVG-refzom.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4afe5112bf3f560532da5783f483bc286bfe5cf757035945b2928cebd696231e
3
+ size 987091461
refzom/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx ADDED
Binary file (5.01 kB). View file
 
refzom/test_log.txt ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:19:47,247 - PropVG - INFO - dataset = 'RefZOM'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFileGRES_TO',
8
+ max_token=50,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='RefZOM',
12
+ use_token_type='beit3',
13
+ refer_file=
14
+ '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
15
+ object_area_filter=100,
16
+ object_area_rate_filter=[0.05, 0.8]),
17
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
18
+ dict(
19
+ type='Normalize',
20
+ mean=[123.675, 116.28, 103.53],
21
+ std=[58.395, 57.12, 57.375]),
22
+ dict(type='DefaultFormatBundle'),
23
+ dict(
24
+ type='CollectData',
25
+ keys=[
26
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
27
+ 'gt_bbox'
28
+ ],
29
+ meta_keys=[
30
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
31
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
32
+ 'refer_target_index'
33
+ ])
34
+ ]
35
+ val_pipeline = [
36
+ dict(
37
+ type='LoadImageAnnotationsFromFileGRES_TO',
38
+ max_token=50,
39
+ with_mask=True,
40
+ with_bbox=True,
41
+ dataset='RefZOM',
42
+ use_token_type='beit3',
43
+ refer_file=
44
+ '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
45
+ object_area_filter=100,
46
+ object_area_rate_filter=[0.05, 0.8]),
47
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
48
+ dict(
49
+ type='Normalize',
50
+ mean=[123.675, 116.28, 103.53],
51
+ std=[58.395, 57.12, 57.375]),
52
+ dict(type='DefaultFormatBundle'),
53
+ dict(
54
+ type='CollectData',
55
+ keys=[
56
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
57
+ 'gt_bbox'
58
+ ],
59
+ meta_keys=[
60
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
61
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
62
+ 'refer_target_index'
63
+ ])
64
+ ]
65
+ test_pipeline = [
66
+ dict(
67
+ type='LoadImageAnnotationsFromFile',
68
+ max_token=20,
69
+ with_bbox=True,
70
+ dataset='RefZOM'),
71
+ dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
72
+ dict(
73
+ type='Normalize',
74
+ mean=[123.675, 116.28, 103.53],
75
+ std=[58.395, 57.12, 57.375]),
76
+ dict(type='Pad', size_divisor=32),
77
+ dict(type='DefaultFormatBundle'),
78
+ dict(type='CollectData', keys=['img', 'ref_expr_inds', 'gt_bbox'])
79
+ ]
80
+ word_emb_cfg = dict(type='GloVe')
81
+ data = dict(
82
+ samples_per_gpu=16,
83
+ workers_per_gpu=4,
84
+ train=dict(
85
+ type='RefZOM',
86
+ which_set='train',
87
+ img_source=['coco'],
88
+ annsfile='./data/seqtr_type/annotations/ref-zom/instance_withid.json',
89
+ imgsfile='./data/seqtr_type/images/mscoco/trainval2014',
90
+ pipeline=[
91
+ dict(
92
+ type='LoadImageAnnotationsFromFileGRES_TO',
93
+ max_token=50,
94
+ with_mask=True,
95
+ with_bbox=True,
96
+ dataset='RefZOM',
97
+ use_token_type='beit3',
98
+ refer_file=
99
+ '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
100
+ object_area_filter=100,
101
+ object_area_rate_filter=[0.05, 0.8]),
102
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
103
+ dict(
104
+ type='Normalize',
105
+ mean=[123.675, 116.28, 103.53],
106
+ std=[58.395, 57.12, 57.375]),
107
+ dict(type='DefaultFormatBundle'),
108
+ dict(
109
+ type='CollectData',
110
+ keys=[
111
+ 'img', 'ref_expr_inds', 'text_attention_mask',
112
+ 'gt_mask_rle', 'gt_bbox'
113
+ ],
114
+ meta_keys=[
115
+ 'filename', 'expression', 'ori_shape', 'img_shape',
116
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
117
+ 'empty', 'refer_target_index'
118
+ ])
119
+ ],
120
+ word_emb_cfg=dict(type='GloVe')),
121
+ val=dict(
122
+ type='RefZOM',
123
+ which_set='test',
124
+ img_source=['coco'],
125
+ annsfile='./data/seqtr_type/annotations/ref-zom/instance_withid.json',
126
+ imgsfile='./data/seqtr_type/images/mscoco/trainval2014',
127
+ pipeline=[
128
+ dict(
129
+ type='LoadImageAnnotationsFromFileGRES_TO',
130
+ max_token=50,
131
+ with_mask=True,
132
+ with_bbox=True,
133
+ dataset='RefZOM',
134
+ use_token_type='beit3',
135
+ refer_file=
136
+ '/home/dmmm/demo/SimVG-MTGA/data/seqtr_type/annotations/ref-zom/allobj.json',
137
+ object_area_filter=100,
138
+ object_area_rate_filter=[0.05, 0.8]),
139
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
140
+ dict(
141
+ type='Normalize',
142
+ mean=[123.675, 116.28, 103.53],
143
+ std=[58.395, 57.12, 57.375]),
144
+ dict(type='DefaultFormatBundle'),
145
+ dict(
146
+ type='CollectData',
147
+ keys=[
148
+ 'img', 'ref_expr_inds', 'text_attention_mask',
149
+ 'gt_mask_rle', 'gt_bbox'
150
+ ],
151
+ meta_keys=[
152
+ 'filename', 'expression', 'ori_shape', 'img_shape',
153
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
154
+ 'empty', 'refer_target_index'
155
+ ])
156
+ ],
157
+ word_emb_cfg=dict(type='GloVe')))
158
+ ema = False
159
+ ema_factor = 0.999
160
+ use_fp16 = False
161
+ seed = 6666
162
+ deterministic = True
163
+ log_level = 'INFO'
164
+ log_interval = 50
165
+ save_interval = -1
166
+ resume_from = None
167
+ load_from = 'work_dir/refzom/PropVG-refzom.pth'
168
+ finetune_from = None
169
+ evaluate_interval = 1
170
+ start_evaluate_epoch = 0
171
+ start_save_checkpoint = 9
172
+ max_token = 50
173
+ img_size = 320
174
+ patch_size = 16
175
+ num_queries = 20
176
+ model = dict(
177
+ type='MIXRefUniModel_OMG',
178
+ vis_enc=dict(
179
+ type='BEIT3',
180
+ img_size=320,
181
+ patch_size=16,
182
+ vit_type='base',
183
+ drop_path_rate=0.1,
184
+ vocab_size=64010,
185
+ freeze_layer=-1,
186
+ vision_embed_proj_interpolate=False,
187
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
188
+ lan_enc=None,
189
+ fusion=None,
190
+ head=dict(
191
+ type='GTMHead',
192
+ input_channels=768,
193
+ hidden_channels=256,
194
+ num_queries=20,
195
+ detr_loss=dict(
196
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
197
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
198
+ loss_weight=dict(
199
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
200
+ bbox=0.1,
201
+ allbbox=0.1,
202
+ refer=1.0),
203
+ MTD=dict(K=100)),
204
+ post_params=dict(
205
+ score_weighted=False,
206
+ mask_threshold=0.5,
207
+ score_threshold=0.7,
208
+ with_nms=False,
209
+ with_mask=True),
210
+ process_visual=True,
211
+ visualize_params=dict(row_columns=(4, 5)),
212
+ visual_mode='test')
213
+ grad_norm_clip = 0.15
214
+ lr = 0.0005
215
+ optimizer_config = dict(
216
+ type='Adam',
217
+ lr=0.0005,
218
+ lr_vis_enc=5e-05,
219
+ lr_lan_enc=0.0005,
220
+ betas=(0.9, 0.98),
221
+ eps=1e-09,
222
+ weight_decay=0,
223
+ amsgrad=True)
224
+ scheduler_config = dict(
225
+ type='MultiStepLRWarmUp',
226
+ warmup_epochs=1,
227
+ decay_steps=[7, 11],
228
+ decay_ratio=0.1,
229
+ max_epoch=12)
230
+ launcher = 'pytorch'
231
+ distributed = True
232
+ rank = 0
233
+ world_size = 4
234
+
235
+ 2025-07-07 11:19:56,830 - PropVG - INFO - RefZOM-test size: 21770
236
+ 2025-07-07 11:20:02,074 - PropVG - INFO - loaded checkpoint from work_dir/refzom/PropVG-refzom.pth
237
+
238
+ 2025-07-07 11:20:02,098 - PropVG - INFO - PropVG - evaluating set val
239
+ 2025-07-07 11:22:14,202 - PropVG - INFO - ------------ validate ------------time: 132.10, mIoU: 71.15, oIoU: 71.95, macc: 98.11, MaskACC@0.5-0.9: [81.03, 76.58, 70.25, 57.06, 25.40
240
+ 2025-07-07 11:22:15,468 - PropVG - INFO - sucessfully save the results to work_dir/refzom/refer_output_thr0.7_no-nms_no-sw_0.5_100.xlsx !!!
rrefcoco/PropVG-rrefcoco.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c03212f30621e0c364a677ad942c3498ffbab00c7851ed4faee0efb3b858371
3
+ size 987093029
rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx ADDED
Binary file (5.08 kB). View file
 
rrefcoco/test_log.txt ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 11:46:17,817 - PropVG - INFO - dataset = 'RRefCOCO'
2
+ data_root = './data/seqtr_type/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
5
+ train_pipeline = [
6
+ dict(
7
+ type='LoadImageAnnotationsFromFileGRES_TO',
8
+ max_token=50,
9
+ with_mask=True,
10
+ with_bbox=True,
11
+ dataset='RRefCOCO',
12
+ use_token_type='beit3',
13
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
14
+ object_area_filter=100,
15
+ object_area_rate_filter=[0.05, 0.8]),
16
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
17
+ dict(
18
+ type='Normalize',
19
+ mean=[123.675, 116.28, 103.53],
20
+ std=[58.395, 57.12, 57.375]),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(
23
+ type='CollectData',
24
+ keys=[
25
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
26
+ 'gt_bbox', 'gt_mask_parts_rle'
27
+ ],
28
+ meta_keys=[
29
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
30
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
31
+ 'refer_target_index', 'tokenized_words'
32
+ ])
33
+ ]
34
+ val_pipeline = [
35
+ dict(
36
+ type='LoadImageAnnotationsFromFileGRES_TO',
37
+ max_token=50,
38
+ with_mask=True,
39
+ with_bbox=True,
40
+ dataset='RRefCOCO',
41
+ use_token_type='beit3',
42
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
43
+ object_area_filter=100,
44
+ object_area_rate_filter=[0.05, 0.8]),
45
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
46
+ dict(
47
+ type='Normalize',
48
+ mean=[123.675, 116.28, 103.53],
49
+ std=[58.395, 57.12, 57.375]),
50
+ dict(type='DefaultFormatBundle'),
51
+ dict(
52
+ type='CollectData',
53
+ keys=[
54
+ 'img', 'ref_expr_inds', 'text_attention_mask', 'gt_mask_rle',
55
+ 'gt_bbox', 'gt_mask_parts_rle'
56
+ ],
57
+ meta_keys=[
58
+ 'filename', 'expression', 'ori_shape', 'img_shape', 'pad_shape',
59
+ 'scale_factor', 'gt_ori_mask', 'target', 'empty',
60
+ 'refer_target_index', 'tokenized_words'
61
+ ])
62
+ ]
63
+ test_pipeline = [
64
+ dict(
65
+ type='LoadImageAnnotationsFromFile',
66
+ max_token=20,
67
+ with_bbox=True,
68
+ dataset='RRefCOCO'),
69
+ dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
70
+ dict(
71
+ type='Normalize',
72
+ mean=[123.675, 116.28, 103.53],
73
+ std=[58.395, 57.12, 57.375]),
74
+ dict(type='Pad', size_divisor=32),
75
+ dict(type='DefaultFormatBundle'),
76
+ dict(type='CollectData', keys=['img', 'ref_expr_inds', 'gt_bbox'])
77
+ ]
78
+ word_emb_cfg = dict(type='GloVe')
79
+ data = dict(
80
+ samples_per_gpu=16,
81
+ workers_per_gpu=4,
82
+ train=dict(
83
+ type='RRefCOCO',
84
+ which_set='train',
85
+ img_source=['coco'],
86
+ annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
87
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
88
+ pipeline=[
89
+ dict(
90
+ type='LoadImageAnnotationsFromFileGRES_TO',
91
+ max_token=50,
92
+ with_mask=True,
93
+ with_bbox=True,
94
+ dataset='RRefCOCO',
95
+ use_token_type='beit3',
96
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
97
+ object_area_filter=100,
98
+ object_area_rate_filter=[0.05, 0.8]),
99
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
100
+ dict(
101
+ type='Normalize',
102
+ mean=[123.675, 116.28, 103.53],
103
+ std=[58.395, 57.12, 57.375]),
104
+ dict(type='DefaultFormatBundle'),
105
+ dict(
106
+ type='CollectData',
107
+ keys=[
108
+ 'img', 'ref_expr_inds', 'text_attention_mask',
109
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
110
+ ],
111
+ meta_keys=[
112
+ 'filename', 'expression', 'ori_shape', 'img_shape',
113
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
114
+ 'empty', 'refer_target_index', 'tokenized_words'
115
+ ])
116
+ ],
117
+ word_emb_cfg=dict(type='GloVe')),
118
+ val_rrefcoco=dict(
119
+ type='RRefCOCO',
120
+ which_set='val_rrefcoco',
121
+ img_source=['coco'],
122
+ annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
123
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
124
+ pipeline=[
125
+ dict(
126
+ type='LoadImageAnnotationsFromFileGRES_TO',
127
+ max_token=50,
128
+ with_mask=True,
129
+ with_bbox=True,
130
+ dataset='RRefCOCO',
131
+ use_token_type='beit3',
132
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
133
+ object_area_filter=100,
134
+ object_area_rate_filter=[0.05, 0.8]),
135
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
136
+ dict(
137
+ type='Normalize',
138
+ mean=[123.675, 116.28, 103.53],
139
+ std=[58.395, 57.12, 57.375]),
140
+ dict(type='DefaultFormatBundle'),
141
+ dict(
142
+ type='CollectData',
143
+ keys=[
144
+ 'img', 'ref_expr_inds', 'text_attention_mask',
145
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
146
+ ],
147
+ meta_keys=[
148
+ 'filename', 'expression', 'ori_shape', 'img_shape',
149
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
150
+ 'empty', 'refer_target_index', 'tokenized_words'
151
+ ])
152
+ ],
153
+ word_emb_cfg=dict(type='GloVe')),
154
+ val_rrefcocoplus=dict(
155
+ type='RRefCOCO',
156
+ which_set='val_rrefcoco+',
157
+ img_source=['coco'],
158
+ annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
159
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
160
+ pipeline=[
161
+ dict(
162
+ type='LoadImageAnnotationsFromFileGRES_TO',
163
+ max_token=50,
164
+ with_mask=True,
165
+ with_bbox=True,
166
+ dataset='RRefCOCO',
167
+ use_token_type='beit3',
168
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
169
+ object_area_filter=100,
170
+ object_area_rate_filter=[0.05, 0.8]),
171
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
172
+ dict(
173
+ type='Normalize',
174
+ mean=[123.675, 116.28, 103.53],
175
+ std=[58.395, 57.12, 57.375]),
176
+ dict(type='DefaultFormatBundle'),
177
+ dict(
178
+ type='CollectData',
179
+ keys=[
180
+ 'img', 'ref_expr_inds', 'text_attention_mask',
181
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
182
+ ],
183
+ meta_keys=[
184
+ 'filename', 'expression', 'ori_shape', 'img_shape',
185
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
186
+ 'empty', 'refer_target_index', 'tokenized_words'
187
+ ])
188
+ ],
189
+ word_emb_cfg=dict(type='GloVe')),
190
+ val_rrefcocog=dict(
191
+ type='RRefCOCO',
192
+ which_set='val_rrefcocog',
193
+ img_source=['coco'],
194
+ annsfile='./data/seqtr_type/annotations/rrefcoco/instance_withid.json',
195
+ imgsfile='./data/seqtr_type/images/mscoco/train2014',
196
+ pipeline=[
197
+ dict(
198
+ type='LoadImageAnnotationsFromFileGRES_TO',
199
+ max_token=50,
200
+ with_mask=True,
201
+ with_bbox=True,
202
+ dataset='RRefCOCO',
203
+ use_token_type='beit3',
204
+ refer_file='./data/seqtr_type/annotations/rrefcoco/allobj.json',
205
+ object_area_filter=100,
206
+ object_area_rate_filter=[0.05, 0.8]),
207
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
208
+ dict(
209
+ type='Normalize',
210
+ mean=[123.675, 116.28, 103.53],
211
+ std=[58.395, 57.12, 57.375]),
212
+ dict(type='DefaultFormatBundle'),
213
+ dict(
214
+ type='CollectData',
215
+ keys=[
216
+ 'img', 'ref_expr_inds', 'text_attention_mask',
217
+ 'gt_mask_rle', 'gt_bbox', 'gt_mask_parts_rle'
218
+ ],
219
+ meta_keys=[
220
+ 'filename', 'expression', 'ori_shape', 'img_shape',
221
+ 'pad_shape', 'scale_factor', 'gt_ori_mask', 'target',
222
+ 'empty', 'refer_target_index', 'tokenized_words'
223
+ ])
224
+ ],
225
+ word_emb_cfg=dict(type='GloVe')))
226
+ ema = False
227
+ ema_factor = 0.999
228
+ use_fp16 = False
229
+ seed = 6666
230
+ deterministic = True
231
+ log_level = 'INFO'
232
+ log_interval = 50
233
+ save_interval = -1
234
+ resume_from = None
235
+ load_from = 'work_dir/rrefcoco/PropVG-rrefcoco.pth'
236
+ finetune_from = None
237
+ evaluate_interval = 1
238
+ start_evaluate_epoch = 0
239
+ start_save_checkpoint = 9
240
+ max_token = 50
241
+ img_size = 320
242
+ patch_size = 16
243
+ num_queries = 20
244
+ model = dict(
245
+ type='MIXRefUniModel_OMG',
246
+ vis_enc=dict(
247
+ type='BEIT3',
248
+ img_size=320,
249
+ patch_size=16,
250
+ vit_type='base',
251
+ drop_path_rate=0.1,
252
+ vocab_size=64010,
253
+ freeze_layer=-1,
254
+ vision_embed_proj_interpolate=False,
255
+ pretrain='pretrain_weights/beit3_base_patch16_224.zip'),
256
+ lan_enc=None,
257
+ fusion=None,
258
+ head=dict(
259
+ type='GTMHead',
260
+ input_channels=768,
261
+ hidden_channels=256,
262
+ num_queries=20,
263
+ detr_loss=dict(
264
+ criterion=dict(loss_class=1.0, loss_bbox=5.0, loss_giou=2.0),
265
+ matcher=dict(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)),
266
+ loss_weight=dict(
267
+ mask=dict(dice=1.0, bce=1.0, nt=0.2, neg=0),
268
+ bbox=0.1,
269
+ allbbox=0.1,
270
+ refer=1.0),
271
+ MTD=dict(K=250)),
272
+ post_params=dict(
273
+ score_weighted=False,
274
+ mask_threshold=0.5,
275
+ score_threshold=0.7,
276
+ with_nms=False,
277
+ with_mask=True),
278
+ process_visual=True,
279
+ visualize_params=dict(row_columns=(4, 5)),
280
+ visual_mode='test')
281
+ grad_norm_clip = 0.15
282
+ lr = 0.0005
283
+ optimizer_config = dict(
284
+ type='Adam',
285
+ lr=0.0005,
286
+ lr_vis_enc=5e-05,
287
+ lr_lan_enc=0.0005,
288
+ betas=(0.9, 0.98),
289
+ eps=1e-09,
290
+ weight_decay=0,
291
+ amsgrad=True)
292
+ scheduler_config = dict(
293
+ type='MultiStepLRWarmUp',
294
+ warmup_epochs=1,
295
+ decay_steps=[7, 11],
296
+ decay_ratio=0.1,
297
+ max_epoch=12)
298
+ launcher = 'none'
299
+ distributed = False
300
+ rank = 0
301
+ world_size = 1
302
+
303
+ 2025-07-07 11:46:34,374 - PropVG - INFO - RRefCOCO-val_rrefcoco size: 52229
304
+ 2025-07-07 11:46:53,442 - PropVG - INFO - RRefCOCO-val_rrefcoco+ size: 49620
305
+ 2025-07-07 11:47:11,525 - PropVG - INFO - RRefCOCO-val_rrefcocog size: 33960
306
+ 2025-07-07 11:47:16,069 - PropVG - INFO - loaded checkpoint from work_dir/rrefcoco/PropVG-rrefcoco.pth
307
+
308
+ 2025-07-07 11:47:16,070 - PropVG - INFO - PropVG - evaluating set val_rrefcoco
309
+ 2025-07-07 11:58:15,741 - PropVG - INFO - ------------ validate ------------time: 659.65, mIoU: 75.86, oIoU: 76.87, mRR: 93.03, rIoU: 62.91
310
+ 2025-07-07 11:58:18,322 - PropVG - INFO - PropVG - evaluating set val_rrefcoco+
311
+ 2025-07-07 12:07:56,811 - PropVG - INFO - ------------ validate ------------time: 578.47, mIoU: 69.39, oIoU: 69.17, mRR: 94.96, rIoU: 59.44
312
+ 2025-07-07 12:07:58,975 - PropVG - INFO - PropVG - evaluating set val_rrefcocog
313
+ 2025-07-07 12:14:35,849 - PropVG - INFO - ------------ validate ------------time: 396.86, mIoU: 69.20, oIoU: 70.13, mRR: 93.85, rIoU: 56.17
314
+ 2025-07-07 12:14:37,866 - PropVG - INFO - sucessfully save the results to work_dir/rrefcoco/refer_output_thr0.7_no-nms_no-sw_0.5_250.xlsx !!!