| _BASE_: "base_model_bert_l12_h192.yaml" | |
| SHARED_TARGETS: | |
| - | |
| NAME: 'VQA_Answer' | |
| SHARED_TARGETS_CFG: | |
| FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl' | |
| DISTRIBUTED: True | |
| TASKS: | |
| - | |
| NAME: vqa | |
| DATASETS: | |
| TRAIN: 'VQADataset' | |
| VAL: 'VQADataset' | |
| DATASET_NAME: 'VQA' | |
| TASK_TYPE: 'vqa' | |
| TARGET_SET: ['VQA_Answer'] | |
| DATALOADER: | |
| TRAIN_BATCH_SIZE: 256 | |
| TEST_BATCH_SIZE: 128 | |
| NUM_WORKERS: 4 | |
| FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin' | |
| ANNO_FOLDER: 'open_source_dataset/VQA' | |
| SEQ_PER_SAMPLE: 1 | |
| MAX_FEAT_NUM: 51 | |
| SAMPLING_WEIGHT: 1.0 | |
| TRANSFORM: 'clip_transforms' | |
| DO_AS_GEN: True | |
| SINGLE_CLASS: True | |
| MODEL: | |
| # VOCAB_SIZE: 49409 # includes <BOS>/<EOS> | |
| PREDICTOR: 'MLPClassifer' | |
| # MM_PREDICTOR: | |
| # LABELS_NUM: 3129 | |
| # PREDICT: 'first_one' | |
| # PRED_DROPOUT: 0.5 | |
| MAX_SEQ_LEN: 23 | |
| # QUERY_EMBED: | |
| # NAME: QueryBaseEmbedding | |
| # DIM: 512 | |
| # QUERY_SIZE: 10 # values greater than 1 are allowed | |
| # ACTIVATION: 'none' | |
| # USE_NORM: True | |
| # DROPOUT: 0.1 | |
| # POSITION: 'none' # must be 'none' for now | |
| # TYPE_VOCAB_SIZE: -1 # must be < 0 | |
| LOSSES: | |
| # loss settings when not single-class: | |
| # NAMES: ['BCEWithLogits'] | |
| # LOSS_WEIGHT: 0.05 | |
| # loss settings for single-class: | |
| NAMES: ['CrossEntropy'] | |
| LOSS_WEIGHT: 0.1 | |
| INFERENCE: | |
| VOCAB: 'CLIP' | |
| NAME: 'VQAEvaler' | |
| ID_KEY: 'question_id' | |
| VALUE: 'answer' | |
| VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl' | |
| TEST_ANNFILE: '' | |
| GENERATION_MODE: False | |
| ######################################### Engine ######################################### | |
| ENGINE: | |
| NAME: 'UnifiedTrainer' | |
| ######################################### Scheduled sampling ######################################### | |
| SCHEDULED_SAMPLING: | |
| START_EPOCH: 0 | |
| INC_EVERY_EPOCH: 5 | |
| INC_PROB: 0.05 | |
| MAX_PROB: 0.25 | |
| DATALOADER: | |
| USE_WEIGHTED_SAMPLER: True | |
| UNIFIED_DATASET: True | |
| ######################################### MODEL ######################################### | |
| MODEL: | |
| TEMP_NAME: logit_scale_downstream | |
| # VOCAB_SIZE: 49409 # includes <BOS>/<EOS> | |
| META_ARCHITECTURE: 'MultiTaskTransformerEncoder' | |
| ENCODER: 'UnifiedBertEncoder' | |
| # ENCODER_DIM: 512 | |
| # DECODER: 'UnifiedTransformerDecoder' | |
| # DECODER_DIM: 512 | |
| BertParamsInit: True | |
| # WEIGHTS: open_source_dataset/our_model/cc3m_encoder_decoder_warm1w_150k_retrivetask_gatherfeature_caption_mlm/model_Epoch_90000_Iter_0089999.pth | |
| CLS_TOKEN: True | |
| # PREDICTOR: 'BasePredictor' | |
| # PRED_DROPOUT: 0.5 | |
| # MAX_SEQ_LEN: 20 | |
| # #################################### Token embedding #################################### | |
| # TOKEN_EMBED: | |
| # NAME: 'TokenBaseEmbedding' | |
| # DIM: 512 | |
| # ACTIVATION: 'none' | |
| # USE_NORM: True | |
| # DROPOUT: 0.1 | |
| # POSITION: 'NNEmbeddingEncoding' | |
| # POSITION_MAX_LEN: 512 | |
| # TYPE_VOCAB_SIZE: 2 | |
| # #################################### Visual embedding #################################### | |
| # VISUAL_EMBED: | |
| # NAME: 'VisualPatchEmbedding' | |
| # IN_DIM: 3 | |
| # OUT_DIM: 512 | |
| # ACTIVATION: 'none' | |
| # USE_NORM: True | |
| # DROPOUT: 0.0 | |
| # PATCH_SIZE: 16 | |
| ####################################### BERT ############################################ | |
| BERT: | |
| DROP_PATH_PROB: 0.05 | |
| # HIDDEN_SIZE: 512 | |
| HIDDEN_SIZE: 192 | |
| HIDDEN_DROPOUT_PROB: 0. | |
| HIDDEN_ACT: "gelu" | |
| NUM_ATTENTION_HEADS: 8 | |
| INTERMEDIATE_SIZE: 2048 | |
| INTERMEDIATE_DROP: 0. | |
| FFN_DROPOUT_PROB: 0. | |
| ATTENTION_PROBS_DROPOUT_PROB: 0. | |
| NUM_HIDDEN_LAYERS: 6 | |
| NUM_GENERATION_LAYERS: 6 | |
| ####################################### Optimizer ####################################### | |
| SOLVER: | |
| NAME: 'AdamW' | |
| # EPOCH: 1 | |
| MAX_ITER: 30000 | |
| CHECKPOINT_PERIOD: 5000 | |
| CHECKPOINT_MAX_SAVE: 5 | |
| EVAL_PERIOD: 1000 | |
| BASE_LR: 0.00005 | |
| BIAS_LR_FACTOR: 1.0 | |
| WEIGHT_DECAY: 0.01 | |
| WEIGHT_DECAY_NORM: 0.0 | |
| WEIGHT_DECAY_BIAS: 0.0 | |
| MOMENTUM: 0.9 | |
| DAMPENING: 0.0 | |
| NESTEROV: 0.0 | |
| BETAS: [0.9, 0.999] | |
| EPS: 1e-8 | |
| GRAD_CLIP: 5.0 | |
| GRAD_CLIP_TYPE: 'norm' | |
| ACCUM_ITER: 0 | |
| AMP_FP16: True | |
| APEX_FP16: False # dangerous | |
| CHECKPOINT_MAPPING: | |
| # - | |
| # ORIGIN: cc3m_caption | |
| # DEST: mscoco | |
| - | |
| ORIGIN: cc3m_retrieve | |
| DEST: flickr30k | |
| CHECKPOINT_MAP: True | |
| ####################################### lr scheduler ####################################### | |
| LR_SCHEDULER: | |
| NAME: 'WarmupCosine' | |
| WARMUP: 1000 | |
| MIN_LR: 0.00000001 | |
| # ####################################### losses ####################################### | |
| # LOSSES: | |
| # NAMES: ['LabelSmoothing'] | |
| # LABELSMOOTHING: 0.1 | |
| ####################################### decode strategy ####################################### | |
| # DECODE_STRATEGY: | |
| # NAME: 'BeamSearcher' | |
| # BEAM_SIZE: 2 | |
| ####################################### evaluation ####################################### | |
| INFERENCE: | |
| VOCAB: 'CLIP' | |
| ITER_BASED: True | |
| find_unused_parameters: true | |