File size: 4,346 Bytes
6fd772e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
{
  "config_file": "config/cfg_odvg_calvin.py",
  "options": {
    "text_encoder_type": "./bert-base-uncased"
  },
  "datasets": "config/calvin_vg.json",
  "remove_difficult": false,
  "fix_size": false,
  "output_dir": "./logs_calvin",
  "note": "",
  "device": "cuda",
  "seed": 42,
  "resume": "",
  "pretrain_model_path": "./groundingdino_swint_ogc.pth",
  "finetune_ignore": null,
  "start_epoch": 0,
  "eval": false,
  "num_workers": 6,
  "test": false,
  "debug": false,
  "find_unused_params": false,
  "save_results": false,
  "save_log": false,
  "world_size": 1,
  "dist_url": "env://",
  "rank": 0,
  "local_rank": 0,
  "amp": false,
  "gpu": 0,
  "distributed": true,
  "dist_backend": "nccl",
  "data_aug_scales": [
    160,
    192,
    200,
    224,
    256
  ],
  "data_aug_max_size": 256,
  "data_aug_scales2_resize": [
    224
  ],
  "data_aug_scales2_crop": [
    224,
    224
  ],
  "data_aug_scale_overlap": null,
  "batch_size": 55,
  "modelname": "groundingdino",
  "backbone": "swin_T_224_1k",
  "position_embedding": "sine",
  "pe_temperatureH": 20,
  "pe_temperatureW": 20,
  "return_interm_indices": [
    1,
    2,
    3
  ],
  "enc_layers": 6,
  "dec_layers": 6,
  "pre_norm": false,
  "dim_feedforward": 2048,
  "hidden_dim": 256,
  "dropout": 0.0,
  "nheads": 8,
  "num_queries": 900,
  "query_dim": 4,
  "num_patterns": 0,
  "num_feature_levels": 4,
  "enc_n_points": 4,
  "dec_n_points": 4,
  "two_stage_type": "standard",
  "two_stage_bbox_embed_share": false,
  "two_stage_class_embed_share": false,
  "transformer_activation": "relu",
  "dec_pred_bbox_embed_share": true,
  "dn_box_noise_scale": 1.0,
  "dn_label_noise_ratio": 0.5,
  "dn_label_coef": 1.0,
  "dn_bbox_coef": 1.0,
  "embed_init_tgt": true,
  "dn_labelbook_size": 91,
  "max_text_len": 256,
  "text_encoder_type": "./bert-base-uncased",
  "use_text_enhancer": true,
  "use_fusion_layer": true,
  "use_checkpoint": true,
  "use_transformer_ckpt": true,
  "use_text_cross_attention": true,
  "text_dropout": 0.0,
  "fusion_dropout": 0.0,
  "fusion_droppath": 0.1,
  "sub_sentence_present": true,
  "max_labels": 30,
  "lr": 0.001,
  "backbone_freeze_keywords": null,
  "freeze_keywords": [],
  "lr_backbone": 1e-05,
  "lr_backbone_names": [
    "backbone.0",
    "bert"
  ],
  "lr_linear_proj_mult": 1e-05,
  "lr_linear_proj_names": [
    "ref_point_head",
    "sampling_offsets"
  ],
  "weight_decay": 0.0001,
  "param_dict_type": "ddetr_in_mmdet",
  "ddetr_lr_param": false,
  "epochs": 50,
  "lr_drop": 10,
  "save_checkpoint_interval": 10,
  "clip_max_norm": 0.1,
  "onecyclelr": false,
  "multi_step_lr": false,
  "lr_drop_list": [
    10,
    20,
    30,
    40
  ],
  "frozen_weights": null,
  "dilation": false,
  "pdetr3_bbox_embed_diff_each_layer": false,
  "pdetr3_refHW": -1,
  "random_refpoints_xy": false,
  "fix_refpoints_hw": -1,
  "dabdetr_yolo_like_anchor_update": false,
  "dabdetr_deformable_encoder": false,
  "dabdetr_deformable_decoder": false,
  "use_deformable_box_attn": false,
  "box_attn_type": "roi_align",
  "dec_layer_number": null,
  "decoder_layer_noise": false,
  "dln_xy_noise": 0.2,
  "dln_hw_noise": 0.2,
  "add_channel_attention": false,
  "add_pos_value": false,
  "two_stage_pat_embed": 0,
  "two_stage_add_query_num": 0,
  "two_stage_learn_wh": false,
  "two_stage_default_hw": 0.05,
  "two_stage_keep_all_tokens": false,
  "num_select": 40,
  "batch_norm_type": "FrozenBatchNorm2d",
  "masks": false,
  "aux_loss": true,
  "set_cost_class": 1.0,
  "set_cost_bbox": 5.0,
  "set_cost_giou": 2.0,
  "cls_loss_coef": 2.5,
  "bbox_loss_coef": 5.0,
  "giou_loss_coef": 2.0,
  "enc_loss_coef": 1.0,
  "interm_loss_coef": 1.0,
  "no_interm_box_loss": false,
  "mask_loss_coef": 1.0,
  "dice_loss_coef": 1.0,
  "focal_alpha": 0.25,
  "focal_gamma": 2.5,
  "decoder_sa_type": "sa",
  "matcher_type": "HungarianMatcher",
  "decoder_module_seq": [
    "sa",
    "ca",
    "ffn"
  ],
  "nms_iou_threshold": -1,
  "dec_pred_class_embed_share": true,
  "match_unstable_error": true,
  "use_ema": true,
  "ema_decay": 0.9997,
  "ema_epoch": 0,
  "use_detached_boxes_dec_out": false,
  "use_coco_eval": false,
  "dn_scalar": 100,
  "label_list": [
    "red_block",
    "blue_block",
    "pink_block",
    "slider",
    "drawer",
    "led",
    "light_bulb",
    "led_button",
    "switch",
    "cabin",
    "gripper"
  ]
}