tiennguyen committed on
Commit
e4c7c3b
·
1 Parent(s): 64774e4

Initial commit

Browse files
checkpoint_best_regular.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2c43f732a56467e00d9e1e7f87b83e8d3b60b681a8763200d22caa1b83080e
3
+ size 2070726177
config_args_all.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_file": "config/cfg_odvg_libero.py",
3
+ "options": {
4
+ "text_encoder_type": "./bert-base-uncased"
5
+ },
6
+ "datasets": "config/libero_vg.json",
7
+ "remove_difficult": false,
8
+ "fix_size": false,
9
+ "output_dir": "./logs",
10
+ "note": "",
11
+ "device": "cuda",
12
+ "seed": 42,
13
+ "resume": "",
14
+ "pretrain_model_path": "./groundingdino_swint_ogc.pth",
15
+ "finetune_ignore": null,
16
+ "start_epoch": 0,
17
+ "eval": false,
18
+ "num_workers": 6,
19
+ "test": false,
20
+ "debug": false,
21
+ "find_unused_params": false,
22
+ "save_results": false,
23
+ "save_log": false,
24
+ "world_size": 1,
25
+ "dist_url": "env://",
26
+ "rank": 0,
27
+ "local_rank": 0,
28
+ "amp": false,
29
+ "gpu": 0,
30
+ "distributed": true,
31
+ "dist_backend": "nccl",
32
+ "data_aug_scales": [
33
+ 192,
34
+ 208,
35
+ 224,
36
+ 240
37
+ ],
38
+ "data_aug_max_size": 240,
39
+ "data_aug_scales2_resize": [
40
+ 256
41
+ ],
42
+ "data_aug_scales2_crop": [
43
+ 224,
44
+ 224
45
+ ],
46
+ "data_aug_scale_overlap": null,
47
+ "batch_size": 55,
48
+ "modelname": "groundingdino",
49
+ "backbone": "swin_T_224_1k",
50
+ "position_embedding": "sine",
51
+ "pe_temperatureH": 20,
52
+ "pe_temperatureW": 20,
53
+ "return_interm_indices": [
54
+ 1,
55
+ 2,
56
+ 3
57
+ ],
58
+ "enc_layers": 6,
59
+ "dec_layers": 6,
60
+ "pre_norm": false,
61
+ "dim_feedforward": 2048,
62
+ "hidden_dim": 256,
63
+ "dropout": 0.0,
64
+ "nheads": 8,
65
+ "num_queries": 900,
66
+ "query_dim": 4,
67
+ "num_patterns": 0,
68
+ "num_feature_levels": 4,
69
+ "enc_n_points": 4,
70
+ "dec_n_points": 4,
71
+ "two_stage_type": "standard",
72
+ "two_stage_bbox_embed_share": false,
73
+ "two_stage_class_embed_share": false,
74
+ "transformer_activation": "relu",
75
+ "dec_pred_bbox_embed_share": true,
76
+ "dn_box_noise_scale": 1.0,
77
+ "dn_label_noise_ratio": 0.5,
78
+ "dn_label_coef": 1.0,
79
+ "dn_bbox_coef": 1.0,
80
+ "embed_init_tgt": true,
81
+ "dn_labelbook_size": 91,
82
+ "max_text_len": 256,
83
+ "text_encoder_type": "./bert-base-uncased",
84
+ "use_text_enhancer": true,
85
+ "use_fusion_layer": true,
86
+ "use_checkpoint": true,
87
+ "use_transformer_ckpt": true,
88
+ "use_text_cross_attention": true,
89
+ "text_dropout": 0.0,
90
+ "fusion_dropout": 0.0,
91
+ "fusion_droppath": 0.1,
92
+ "sub_sentence_present": true,
93
+ "max_labels": 30,
94
+ "lr": 0.001,
95
+ "backbone_freeze_keywords": null,
96
+ "freeze_keywords": [],
97
+ "lr_backbone": 1e-05,
98
+ "lr_backbone_names": [
99
+ "backbone.0",
100
+ "bert"
101
+ ],
102
+ "lr_linear_proj_mult": 1e-05,
103
+ "lr_linear_proj_names": [
104
+ "ref_point_head",
105
+ "sampling_offsets"
106
+ ],
107
+ "weight_decay": 0.0001,
108
+ "param_dict_type": "ddetr_in_mmdet",
109
+ "ddetr_lr_param": false,
110
+ "epochs": 50,
111
+ "lr_drop": 10,
112
+ "save_checkpoint_interval": 10,
113
+ "clip_max_norm": 0.1,
114
+ "onecyclelr": false,
115
+ "multi_step_lr": false,
116
+ "lr_drop_list": [
117
+ 10,
118
+ 20,
119
+ 30,
120
+ 40
121
+ ],
122
+ "frozen_weights": null,
123
+ "dilation": false,
124
+ "pdetr3_bbox_embed_diff_each_layer": false,
125
+ "pdetr3_refHW": -1,
126
+ "random_refpoints_xy": false,
127
+ "fix_refpoints_hw": -1,
128
+ "dabdetr_yolo_like_anchor_update": false,
129
+ "dabdetr_deformable_encoder": false,
130
+ "dabdetr_deformable_decoder": false,
131
+ "use_deformable_box_attn": false,
132
+ "box_attn_type": "roi_align",
133
+ "dec_layer_number": null,
134
+ "decoder_layer_noise": false,
135
+ "dln_xy_noise": 0.2,
136
+ "dln_hw_noise": 0.2,
137
+ "add_channel_attention": false,
138
+ "add_pos_value": false,
139
+ "two_stage_pat_embed": 0,
140
+ "two_stage_add_query_num": 0,
141
+ "two_stage_learn_wh": false,
142
+ "two_stage_default_hw": 0.05,
143
+ "two_stage_keep_all_tokens": false,
144
+ "num_select": 40,
145
+ "batch_norm_type": "FrozenBatchNorm2d",
146
+ "masks": false,
147
+ "aux_loss": true,
148
+ "set_cost_class": 1.0,
149
+ "set_cost_bbox": 5.0,
150
+ "set_cost_giou": 2.0,
151
+ "cls_loss_coef": 2.5,
152
+ "bbox_loss_coef": 5.0,
153
+ "giou_loss_coef": 2.0,
154
+ "enc_loss_coef": 1.0,
155
+ "interm_loss_coef": 1.0,
156
+ "no_interm_box_loss": false,
157
+ "mask_loss_coef": 1.0,
158
+ "dice_loss_coef": 1.0,
159
+ "focal_alpha": 0.25,
160
+ "focal_gamma": 2.5,
161
+ "decoder_sa_type": "sa",
162
+ "matcher_type": "HungarianMatcher",
163
+ "decoder_module_seq": [
164
+ "sa",
165
+ "ca",
166
+ "ffn"
167
+ ],
168
+ "nms_iou_threshold": -1,
169
+ "dec_pred_class_embed_share": true,
170
+ "match_unstable_error": true,
171
+ "use_ema": true,
172
+ "ema_decay": 0.9997,
173
+ "ema_epoch": 0,
174
+ "use_detached_boxes_dec_out": false,
175
+ "use_coco_eval": false,
176
+ "dn_scalar": 100,
177
+ "label_list": [
178
+ "alphabet soup",
179
+ "basket",
180
+ "bbq sauce",
181
+ "black bowl",
182
+ "book",
183
+ "butter",
184
+ "cabinet",
185
+ "caddy",
186
+ "chocolate pudding",
187
+ "cream cheese",
188
+ "gripper",
189
+ "ketchup",
190
+ "left moka pot",
191
+ "left plate",
192
+ "microwave",
193
+ "milk",
194
+ "moka pot",
195
+ "orange juice",
196
+ "plate",
197
+ "right moka pot",
198
+ "right plate",
199
+ "salad dressing",
200
+ "stove",
201
+ "tomato sauce",
202
+ "white mug",
203
+ "wine bottle",
204
+ "wine rack",
205
+ "yellow and white mug"
206
+ ]
207
+ }
config_args_raw.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_file": "config/cfg_odvg_libero.py",
3
+ "options": {
4
+ "text_encoder_type": "./bert-base-uncased"
5
+ },
6
+ "datasets": "config/libero_vg.json",
7
+ "remove_difficult": false,
8
+ "fix_size": false,
9
+ "output_dir": "./logs",
10
+ "note": "",
11
+ "device": "cuda",
12
+ "seed": 42,
13
+ "resume": "",
14
+ "pretrain_model_path": "./groundingdino_swint_ogc.pth",
15
+ "finetune_ignore": null,
16
+ "start_epoch": 0,
17
+ "eval": false,
18
+ "num_workers": 6,
19
+ "test": false,
20
+ "debug": false,
21
+ "find_unused_params": false,
22
+ "save_results": false,
23
+ "save_log": false,
24
+ "world_size": 1,
25
+ "dist_url": "env://",
26
+ "rank": 0,
27
+ "local_rank": 0,
28
+ "amp": false,
29
+ "gpu": 0,
30
+ "distributed": true,
31
+ "dist_backend": "nccl"
32
+ }
config_cfg.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_aug_scales = [192, 208, 224, 240]
2
+ data_aug_max_size = 240
3
+ data_aug_scales2_resize = [256]
4
+ data_aug_scales2_crop = [224, 224]
5
+ data_aug_scale_overlap = None
6
+ batch_size = 55
7
+ modelname = 'groundingdino'
8
+ backbone = 'swin_T_224_1k'
9
+ position_embedding = 'sine'
10
+ pe_temperatureH = 20
11
+ pe_temperatureW = 20
12
+ return_interm_indices = [1, 2, 3]
13
+ enc_layers = 6
14
+ dec_layers = 6
15
+ pre_norm = False
16
+ dim_feedforward = 2048
17
+ hidden_dim = 256
18
+ dropout = 0.0
19
+ nheads = 8
20
+ num_queries = 900
21
+ query_dim = 4
22
+ num_patterns = 0
23
+ num_feature_levels = 4
24
+ enc_n_points = 4
25
+ dec_n_points = 4
26
+ two_stage_type = 'standard'
27
+ two_stage_bbox_embed_share = False
28
+ two_stage_class_embed_share = False
29
+ transformer_activation = 'relu'
30
+ dec_pred_bbox_embed_share = True
31
+ dn_box_noise_scale = 1.0
32
+ dn_label_noise_ratio = 0.5
33
+ dn_label_coef = 1.0
34
+ dn_bbox_coef = 1.0
35
+ embed_init_tgt = True
36
+ dn_labelbook_size = 91
37
+ max_text_len = 256
38
+ text_encoder_type = './bert-base-uncased'
39
+ use_text_enhancer = True
40
+ use_fusion_layer = True
41
+ use_checkpoint = True
42
+ use_transformer_ckpt = True
43
+ use_text_cross_attention = True
44
+ text_dropout = 0.0
45
+ fusion_dropout = 0.0
46
+ fusion_droppath = 0.1
47
+ sub_sentence_present = True
48
+ max_labels = 30
49
+ lr = 0.001
50
+ backbone_freeze_keywords = None
51
+ freeze_keywords = []
52
+ lr_backbone = 1e-05
53
+ lr_backbone_names = ['backbone.0', 'bert']
54
+ lr_linear_proj_mult = 1e-05
55
+ lr_linear_proj_names = ['ref_point_head', 'sampling_offsets']
56
+ weight_decay = 0.0001
57
+ param_dict_type = 'ddetr_in_mmdet'
58
+ ddetr_lr_param = False
59
+ epochs = 50
60
+ lr_drop = 10
61
+ save_checkpoint_interval = 10
62
+ clip_max_norm = 0.1
63
+ onecyclelr = False
64
+ multi_step_lr = False
65
+ lr_drop_list = [10, 20, 30, 40]
66
+ frozen_weights = None
67
+ dilation = False
68
+ pdetr3_bbox_embed_diff_each_layer = False
69
+ pdetr3_refHW = -1
70
+ random_refpoints_xy = False
71
+ fix_refpoints_hw = -1
72
+ dabdetr_yolo_like_anchor_update = False
73
+ dabdetr_deformable_encoder = False
74
+ dabdetr_deformable_decoder = False
75
+ use_deformable_box_attn = False
76
+ box_attn_type = 'roi_align'
77
+ dec_layer_number = None
78
+ decoder_layer_noise = False
79
+ dln_xy_noise = 0.2
80
+ dln_hw_noise = 0.2
81
+ add_channel_attention = False
82
+ add_pos_value = False
83
+ two_stage_pat_embed = 0
84
+ two_stage_add_query_num = 0
85
+ two_stage_learn_wh = False
86
+ two_stage_default_hw = 0.05
87
+ two_stage_keep_all_tokens = False
88
+ num_select = 40
89
+ batch_norm_type = 'FrozenBatchNorm2d'
90
+ masks = False
91
+ aux_loss = True
92
+ set_cost_class = 1.0
93
+ set_cost_bbox = 5.0
94
+ set_cost_giou = 2.0
95
+ cls_loss_coef = 2.5
96
+ bbox_loss_coef = 5.0
97
+ giou_loss_coef = 2.0
98
+ enc_loss_coef = 1.0
99
+ interm_loss_coef = 1.0
100
+ no_interm_box_loss = False
101
+ mask_loss_coef = 1.0
102
+ dice_loss_coef = 1.0
103
+ focal_alpha = 0.25
104
+ focal_gamma = 2.5
105
+ decoder_sa_type = 'sa'
106
+ matcher_type = 'HungarianMatcher'
107
+ decoder_module_seq = ['sa', 'ca', 'ffn']
108
+ nms_iou_threshold = -1
109
+ dec_pred_class_embed_share = True
110
+ match_unstable_error = True
111
+ use_ema = True
112
+ ema_decay = 0.9997
113
+ ema_epoch = 0
114
+ use_detached_boxes_dec_out = False
115
+ use_coco_eval = False
116
+ dn_scalar = 100
117
+ label_list = [
118
+ 'alphabet soup', 'basket', 'bbq sauce', 'black bowl', 'book', 'butter',
119
+ 'cabinet', 'caddy', 'chocolate pudding', 'cream cheese', 'gripper',
120
+ 'ketchup', 'left moka pot', 'left plate', 'microwave', 'milk', 'moka pot',
121
+ 'orange juice', 'plate', 'right moka pot', 'right plate', 'salad dressing',
122
+ 'stove', 'tomato sauce', 'white mug', 'wine bottle', 'wine rack',
123
+ 'yellow and white mug'
124
+ ]
eval/000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f62bc22f5cf10bedd33108439618b479bc3e85d7fe9887237ef982c4b391e2
3
+ size 5934741
eval/latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f114726fc6442b4e6c2f09e700dabf1288d9fb41babc6fe2d58f62783be85ebb
3
+ size 6064615
info.txt ADDED
The diff for this file is too large to render. See raw diff
 
log.txt ADDED
The diff for this file is too large to render. See raw diff