| _BASE_: "base_model_bert_l12_h192.yaml" | |
| # Shared target sets, referenced by NAME from a task's TARGET_SET list | |
| # (the mscoco_caption task below uses TARGET_SET: ['Vocab_Word']). | |
| # Only 'Vocab_Word' is enabled; the 'ImageNet1k' and 'Kinetics400' entries are commented out. | |
| SHARED_TARGETS: | |
| # - | |
| # NAME: 'ImageNet1k' | |
| # SHARED_TARGETS_CFG: | |
| # FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl' | |
| # DISTRIBUTED: False | |
| - | |
| NAME: 'Vocab_Word' | |
| SHARED_TARGETS_CFG: | |
| FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl' | |
| DISTRIBUTED: True | |
| # - | |
| # NAME: 'Kinetics400' | |
| # SHARED_TARGETS_CFG: | |
| # FILE_PATH: 'open_source_dataset/k400_class_name_CLIP_with_endoftext.pkl' | |
| # DISTRIBUTED: False | |
| # Task list. Only the 'mscoco_caption' entry is active in this file; the imagenet, | |
| # K400_retrieve, bookswiki_pretrain, mscoco_retrieve and the cc12m / cc3m / vg / sbu | |
| # caption entries are all commented out. | |
| TASKS: | |
| # - | |
| # NAME: imagenet | |
| # DATASETS: | |
| # TRAIN: 'ImageNetDataset' | |
| # VAL: 'ImageNetDataset' | |
| # TASK_TYPE: 'image_classification' | |
| # DATASET_NAME: 'ImageNet1k' | |
| # TARGET_SET: ['ImageNet1k'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 720 | |
| # # TEST_BATCH_SIZE: 2 | |
| # NUM_WORKERS: 4 | |
| # FEATS_FOLDER: 'cluster2:s3://imagenet' | |
| # ANNO_FOLDER: 'open_source_dataset/imagenet/meta' | |
| # SAMPLING_WEIGHT: 2.5 | |
| # CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl' | |
| # MIXUP: 0.8 | |
| # CUTMIX: 1.0 | |
| # MIXUP_PROB: 1.0 | |
| # MIXUP_SWITCH_PROB: 0.5 | |
| # MIXUP_MODE: 'batch' | |
| # MIXUP_LABEL_SMOOTHING: 0.1 | |
| # MODEL: | |
| # MAX_SEQ_LEN: -1 | |
| # LABELS_NUM: 1000 | |
| # TEMP_NAME: logit_scale_img_cls | |
| # LOSSES: | |
| # NAMES: ['SoftTargetCrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 1.0 | |
| # REDUCTION: 'mean' | |
| # # LOSS_FP32: True | |
| # INFERENCE: | |
| # NAME: 'ImageNetEvaler' | |
| # ID_KEY: 'image_id' | |
| # VALUE: 'cls_logits' | |
| # VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt' | |
| # TEST_ANNFILE: '' | |
| # GENERATION_MODE: False | |
| # - | |
| # NAME: K400_retrieve | |
| # DATASETS: | |
| # TRAIN: 'VideoDataSet' | |
| # VAL: 'VideoDataSet' | |
| # TASK_TYPE: 'video_classification' | |
| # DATASET_NAME: 'K400' | |
| # TARGET_SET: ['Kinetics400'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 12 # 256 | |
| # TEST_BATCH_SIZE: 4 # debug | |
| # NUM_WORKERS: 4 # debug 4 | |
| # FEATS_FOLDER: 'open_source_dataset/K400_official' | |
| # ANNO_FOLDER: 'open_source_dataset/K400_official' | |
| # S3_PATH: 's3://K400/' | |
| # FRAMES_PER_CLIP: 8 | |
| # STRIDE: 32 | |
| # FILE_EXTENSION: '' | |
| # ANNO_FILE: 'annotation.json' | |
| # TIMESFORMER_AUG: True | |
| # SAMPLING_WEIGHT: 1.0 | |
| # MODEL: | |
| # MAX_SEQ_LEN: -1 | |
| # TEMP_NAME: logit_scale_video_cls | |
| # LOSSES: | |
| # NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy'] | |
| # LABELSMOOTHING: 0.1 | |
| # LOSS_WEIGHT: 1.0 | |
| # INFERENCE: | |
| # NAME: 'MiTEvaler' | |
| # ID_KEY: 'video_name' | |
| # VALUE: 'label' | |
| # VAL_ANNFILE: 'open_source_dataset/K400_official/annotation.json' | |
| # TEST_ANNFILE: '' | |
| # GENERATION_MODE: False | |
| # NUM_VIEWS: 1 | |
| # - | |
| # NAME: bookswiki_pretrain | |
| # DATASETS: | |
| # TRAIN: 'GeneralCorpusDataset' | |
| # TASK_TYPE: 'text_mlm' | |
| # DATASET_NAME: 'BooksWiki' | |
| # TARGET_SET: ['Vocab_Word'] | |
| # VERSION: 'v2' | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 512 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 2 | |
| # ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki' | |
| # # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki' | |
| # SEQ_PER_SAMPLE: 1 | |
| # SAMPLER: NodeDistributed | |
| # CACHE_MODE: True | |
| # SEQ_PER_SAMPLE: 128 | |
| # MIN_SEQ_PER_SAMPLE: 128 | |
| # APPEND_EOS: True | |
| # ONE_STREAM: False | |
| # SAMPLING_WEIGHT: 3.5 | |
| # RANDOM_MASK: True | |
| # MODEL: | |
| # MAX_SEQ_LEN: 128 | |
| # TEMP_NAME: logit_scale_text_mlm | |
| # LOSSES: | |
| # NAMES: ['CrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 0.33333 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # GENERATION_MODE: False | |
| # - | |
| # NAME: mscoco_retrieve | |
| # DATASETS: | |
| # TRAIN: 'ImageTextPairDataset' | |
| # TEST: 'ImageTextPairDataset' | |
| # TASK_TYPE: 'image_retrieval' | |
| # DATASET_NAME: 'MSCOCO' | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 100 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 1 | |
| # FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin' | |
| # ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations' | |
| # S3_PATH: 's3://coco/' | |
| # SEQ_PER_SAMPLE: 1 | |
| # CACHE_MODE: True | |
| # CIRCULAR_CACHE_MODE: False | |
| # ZIP_MODE: False | |
| # CACHE_ORIGIN_IMAGE: False | |
| # RANDOM_CAPTION: False | |
| # AS_NUMPY_AS_POSSIBLE: False | |
| # SAMPLING_WEIGHT: 1.0 | |
| # TRANSFORM: 'clip_transforms' | |
| # MODEL: | |
| # MAX_SEQ_LEN: 50 | |
| # TEMP_NAME: logit_scale_retrieve | |
| # LOSSES: | |
| # NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy'] | |
| # LABELSMOOTHING: 0.1 | |
| # LOSS_WEIGHT: 1.0 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # ID_KEY: 'image_id' | |
| # VALUE: 'caption' | |
| # NAME: 'RetrievalEvaler' | |
| # VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline' | |
| # TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline' | |
| # GENERATION_MODE: False | |
| ########## Image Captioning ########### | |
| # - | |
| # NAME: cc12m_caption | |
| # DATASETS: | |
| # TRAIN: 'ImageTextPairDataset' | |
| # TASK_TYPE: 'image_caption' | |
| # DATASET_NAME: 'CC12M' | |
| # TARGET_SET: ['Vocab_Word'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 300 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 2 | |
| # S3_ANNO_FOLDER: 's3://cc12m/' | |
| # ANNO_FOLDER: 'open_source_dataset/c12m/' | |
| # ANNO_FILENAME: 'train_available.json' | |
| # FEATS_FOLDER: 'open_source_dataset/c12m/' | |
| # S3_PATH: 's3://cc12m/' | |
| # SEQ_PER_SAMPLE: 1 | |
| # SAMPLER: NodeDistributed | |
| # CACHE_MODE: True | |
| # CIRCULAR_CACHE_MODE: False | |
| # ZIP_MODE: False | |
| # CACHE_ORIGIN_IMAGE: False | |
| # RANDOM_CAPTION: False | |
| # AS_NUMPY_AS_POSSIBLE: False | |
| # SAMPLING_WEIGHT: 1.6889 | |
| # TRANSFORM: 'clip_transforms' | |
| # MODEL: | |
| # MAX_SEQ_LEN: 50 | |
| # TEMP_NAME: logit_scale_caption | |
| # LOSSES: | |
| # NAMES: ['CrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 0.33333 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # GENERATION_MODE: False | |
| # - | |
| # NAME: cc3m_caption | |
| # DATASETS: | |
| # TRAIN: 'ImageTextPairDataset' | |
| # TASK_TYPE: 'image_caption' | |
| # DATASET_NAME: 'CC3M' | |
| # TARGET_SET: ['Vocab_Word'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 300 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 2 | |
| # ANNO_FOLDER: 's3://cc3m/' | |
| # ANNO_FILENAME: 'train_spacy.json' | |
| # FEATS_FOLDER: 'open_source_dataset/cc3m/' | |
| # S3_PATH: 's3://cc3m/' | |
| # SEQ_PER_SAMPLE: 1 | |
| # SAMPLER: NodeDistributed | |
| # CACHE_MODE: True | |
| # CIRCULAR_CACHE_MODE: False | |
| # ZIP_MODE: False | |
| # CACHE_ORIGIN_IMAGE: False | |
| # RANDOM_CAPTION: False | |
| # AS_NUMPY_AS_POSSIBLE: False | |
| # SAMPLING_WEIGHT: 0.8780 | |
| # TRANSFORM: 'clip_transforms' | |
| # MODEL: | |
| # MAX_SEQ_LEN: 50 | |
| # TEMP_NAME: logit_scale_caption | |
| # LOSSES: | |
| # NAMES: ['CrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 0.33333 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # GENERATION_MODE: False | |
| # - | |
| # NAME: vg_caption | |
| # DATASETS: | |
| # TRAIN: 'ImageTextPairDataset' | |
| # TASK_TYPE: 'image_caption' | |
| # DATASET_NAME: 'VG' | |
| # TARGET_SET: ['Vocab_Word'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 300 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 2 | |
| # FEATS_FOLDER: 'open_source_dataset/visual_genome/images' | |
| # ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations' | |
| # S3_PATH: 's3://visual_genome/images' | |
| # ANNO_FILENAME: 'vg_captions_128filter.json' | |
| # SEQ_PER_SAMPLE: 1 | |
| # CACHE_MODE: True | |
| # CIRCULAR_CACHE_MODE: False | |
| # ZIP_MODE: False | |
| # CACHE_ORIGIN_IMAGE: False | |
| # RANDOM_CAPTION: False | |
| # AS_NUMPY_AS_POSSIBLE: False | |
| # SAMPLING_WEIGHT: 0.5895 | |
| # TRANSFORM: 'clip_transforms' | |
| # MODEL: | |
| # MAX_SEQ_LEN: 30 | |
| # TEMP_NAME: logit_scale_caption | |
| # LOSSES: | |
| # NAMES: ['CrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 0.33333 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # GENERATION_MODE: True | |
| # Active task: MSCOCO image captioning (TASK_TYPE 'image_caption') against the 'Vocab_Word' target set. | |
| - | |
| NAME: mscoco_caption | |
| DATASETS: | |
| # Only the TRAIN and TEST splits are enabled; VAL is commented out. | |
| TRAIN: 'ImageTextPairDataset' | |
| # VAL: 'ImageTextPairDataset' | |
| TEST: 'ImageTextPairDataset' | |
| TASK_TYPE: 'image_caption' | |
| DATASET_NAME: 'MSCOCO' | |
| TARGET_SET: ['Vocab_Word'] | |
| DATALOADER: | |
| # NOTE(review): TRAIN_BATCH_SIZE is 32 here, while the commented-out caption tasks use 300; | |
| # presumably a small-scale / fine-tuning setting - confirm. | |
| TRAIN_BATCH_SIZE: 32 | |
| TEST_BATCH_SIZE: 2 | |
| NUM_WORKERS: 4 | |
| FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin' | |
| ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations' | |
| S3_PATH: 's3://coco/' | |
| SEQ_PER_SAMPLE: 1 | |
| CACHE_MODE: True | |
| CIRCULAR_CACHE_MODE: False | |
| ZIP_MODE: False | |
| CACHE_ORIGIN_IMAGE: False | |
| RANDOM_CAPTION: False | |
| AS_NUMPY_AS_POSSIBLE: False | |
| SAMPLING_WEIGHT: 0.3817 | |
| TRANSFORM: 'clip_transforms' | |
| RANDOM_MASK: True | |
| MODEL: | |
| # NOTE(review): EVAL_MAX_SEQ_LEN (21) is smaller than MAX_SEQ_LEN (50); presumably it caps | |
| # the caption length at evaluation time - confirm against the model code. | |
| MAX_SEQ_LEN: 50 | |
| EVAL_MAX_SEQ_LEN: 21 | |
| TEMP_NAME: logit_scale_caption | |
| LOSSES: | |
| NAMES: ['CrossEntropy', 'Accuracy'] | |
| LOSS_WEIGHT: 0.33333 | |
| REDUCTION: 'mean' | |
| # Caption decoding uses 'CaptionBeamSearcherV3' with BEAM_SIZE 2; LEN_PENALTY is commented out. | |
| DECODE_STRATEGY: | |
| NAME: 'CaptionBeamSearcherV3' | |
| BEAM_SIZE: 2 | |
| # LEN_PENALTY: 2.0 | |
| INFERENCE: | |
| NAME: 'COCOEvaler' | |
| VOCAB: 'CLIP' | |
| ID_KEY: 'image_id' | |
| VALUE: 'caption' | |
| # NOTE(review): VAL_ANNFILE is set although DATASETS.VAL is commented out above; | |
| # presumably unused in this configuration - confirm. | |
| VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json' | |
| TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json' | |
| GENERATION_MODE: True | |
| # - | |
| # NAME: sbu_caption | |
| # DATASETS: | |
| # TRAIN: 'ImageTextPairDataset' | |
| # TASK_TYPE: 'image_caption' | |
| # DATASET_NAME: 'SBU' | |
| # TARGET_SET: ['Vocab_Word'] | |
| # DATALOADER: | |
| # TRAIN_BATCH_SIZE: 300 | |
| # TEST_BATCH_SIZE: 32 | |
| # NUM_WORKERS: 1 | |
| # S3_ANNO_FOLDER: 's3://SBU/annotations' | |
| # ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations' | |
| # ANNO_FILENAME: 'subcaption.json' | |
| # FEATS_FOLDER: 'open_source_dataset/sbucaption/' | |
| # S3_PATH: 's3://SBU/images' | |
| # SEQ_PER_SAMPLE: 1 | |
| # SAMPLER: NodeDistributed | |
| # CACHE_MODE: True | |
| # CIRCULAR_CACHE_MODE: False | |
| # ZIP_MODE: False | |
| # CACHE_ORIGIN_IMAGE: False | |
| # RANDOM_CAPTION: False | |
| # AS_NUMPY_AS_POSSIBLE: False | |
| # SAMPLING_WEIGHT: 0.4618 | |
| # TRANSFORM: 'clip_transforms' | |
| # MODEL: | |
| # MAX_SEQ_LEN: 50 | |
| # TEMP_NAME: logit_scale_caption | |
| # LOSSES: | |
| # NAMES: ['CrossEntropy', 'Accuracy'] | |
| # LOSS_WEIGHT: 0.33333 | |
| # REDUCTION: 'mean' | |
| # INFERENCE: | |
| # VOCAB: 'CLIP' | |
| # GENERATION_MODE: False | |
| # Training engine, selected by name. | |
| ENGINE: | |
| NAME: 'UnifiedTrainer' | |
| # Model definition: 'MultiTaskTransformerEncoder' meta-architecture with the 'UnifiedBertEncoder' encoder. | |
| MODEL: | |
| META_ARCHITECTURE: 'MultiTaskTransformerEncoder' | |
| ENCODER: 'UnifiedBertEncoder' | |
| IN_TUNING: True # use IN1k instead of 22k | |
| SHARE_LAYERNORM: True | |
| # NOTE(review): presumably overrides BERT settings inherited from the _BASE_ file - confirm. | |
| BERT: | |
| NORMALIZE_DECISION: "BERTPre" | |
| DROP_PATH_PROB: 0.0 | |
| DROP_PATH_PROB_FIXED: True | |
| # NOTE(review): MODEL_EMA is False, so MODEL_EMA_DECAY presumably has no effect - confirm. | |
| MODEL_EMA: False | |
| MODEL_EMA_DECAY: 0.9999 | |
| MAEParamsInit: True | |
| POSEMBEDFIX: True | |
| IMG_INPUT_SIZE: 224 | |
| PATCH_SIZE: 16 | |
| LAYER_SCALE: True | |
| # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) resolve an | |
| # exponent literal without a '.' (such as 1e-3) as a string rather than a float. | |
| LAYER_SCALE_INIT: 1.0e-3 | |
| LAYER_SCALE_FP32: True | |
| GATE_FP32: False | |
| TAG_TRANSFORM_FP32: False | |
| # Global data-loading settings. | |
| DATALOADER: | |
| USE_WEIGHTED_SAMPLER: True | |
| UNIFIED_DATASET: True | |
| # NOTE(review): NUM_WORKERS is 32 here but 4 in the mscoco_caption task's own DATALOADER | |
| # section; confirm which value takes precedence. | |
| NUM_WORKERS: 32 | |
| STRATEGY: 'turn' | |
| PADDING_TO_MAX: False # True for debugging or token moe with distributed moe | |
| ####################################### Optimizer ####################################### | |
| # Optimizer, iteration schedule and mixed-precision settings. | |
| SOLVER: | |
| NAME: 'Adam' | |
| TORCH_OPTIMIZER: True | |
| # NOTE(review): key is spelled 'SEPERATE'; presumably it matches the config schema - | |
| # verify against the loader before renaming. | |
| PARAMS_SEPERATE: True | |
| # PARAMS_GROUP: True | |
| # EPOCH: 1 | |
| MAX_ITER: 150000 | |
| CHECKPOINT_PERIOD: 5000 | |
| # NOTE(review): EVAL_PERIOD (500000) exceeds MAX_ITER (150000), so periodic evaluation | |
| # presumably never triggers during training - confirm this is intended. | |
| EVAL_PERIOD: 500000 | |
| BASE_LR: 0.001 | |
| BIAS_LR_FACTOR: 1.0 | |
| WEIGHT_DECAY: 0.05 | |
| WEIGHT_DECAY_NORM: 0.0 | |
| WEIGHT_DECAY_BIAS: 0.0 | |
| WEIGHT_DECAY_EMBEDDING: 0.0 | |
| # NOTE(review): MOMENTUM / DAMPENING / NESTEROV look like SGD-style options; presumably | |
| # ignored when NAME is 'Adam' - confirm. | |
| MOMENTUM: 0.9 | |
| DAMPENING: 0.0 | |
| NESTEROV: 0.0 | |
| BETAS: [0.9, 0.95] | |
| # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) resolve an | |
| # exponent literal without a '.' (such as 1e-6) as a string rather than a float. | |
| EPS: 1.0e-6 | |
| GRAD_CLIP: 0.1 | |
| GRAD_CLIP_TYPE: 'norm' | |
| # NOTE(review): 0 presumably disables gradient accumulation - confirm. | |
| ACCUM_ITER: 0 | |
| AMP_FP16: True | |
| APEX_FP16: False # dangerous | |
| WRITE_PERIOD: 50 | |
| # NOTE(review): key is spelled 'SCLE' (not 'SCALE'); presumably it matches the config | |
| # schema - verify against the loader before renaming. | |
| MIN_LOSS_SCLE: 2048.0 | |
| # BF16: False # True | |
| # ZEROSTAGE: 2 | |
| LOSS_SCALE_WINDOW: 200 | |
| FORCE_SOFTMAX_FP16: True | |
| FORCE_LN_FP16: True | |
| FORCE_NORM_FP16: True | |
| # FORCE_TEMP_FP16: True | |
| FORCE_EMBED_FP16: True | |
| ####################################### lr scheduler ####################################### | |
| # 'WarmupCosine' learning-rate schedule with MIN_LR 0.000001 (SOLVER.BASE_LR is 0.001). | |
| # NOTE(review): WARMUP is presumably counted in iterations - confirm. | |
| LR_SCHEDULER: | |
| NAME: 'WarmupCosine' | |
| WARMUP: 5000 | |
| MIN_LR: 0.000001 | |
| ####################################### evaluation ####################################### | |
| # Global evaluation settings; uses the same VOCAB value ('CLIP') as the mscoco_caption task's INFERENCE section. | |
| INFERENCE: | |
| VOCAB: 'CLIP' | |
| # NOTE(review): presumably switches evaluation scheduling to iteration counts - confirm. | |
| ITER_BASED: True | |
| # NOTE(review): the only all-lower-case key in this file, written with lower-case `true`; | |
| # presumably forwarded to the distributed-data-parallel wrapper - confirm. | |
| find_unused_parameters: true | |
| # ENCODERS: | |
| # - | |
| # NAME: VisualEncoder | |
| # TYPE: VisualEncoder | |
| # DROP_PATH_PROB: 0.0 | |
| # HIDDEN_SIZE: 192 | |
| # HIDDEN_DROPOUT_PROB: 0. | |
| # HIDDEN_ACT: "gelu" | |
| # NUM_ATTENTION_HEADS: 3 | |
| # INTERMEDIATE_SIZE: 768 | |
| # INTERMEDIATE_DROP: 0. | |
| # FFN_DROPOUT_PROB: 0. | |
| # ATTENTION_PROBS_DROPOUT_PROB: 0. | |
| # NUM_HIDDEN_LAYERS: 6 | |
| # NUM_GENERATION_LAYERS: 0 | |
| # DROP_PATH_PROB_FIXED: True | |
| # - | |
| # NAME: TextEncoder | |
| # TYPE: TextEncoder | |
| # DROP_PATH_PROB: 0.0 | |
| # HIDDEN_SIZE: 192 | |
| # HIDDEN_DROPOUT_PROB: 0. | |
| # HIDDEN_ACT: "gelu" | |
| # NUM_ATTENTION_HEADS: 3 | |
| # INTERMEDIATE_SIZE: 768 | |
| # INTERMEDIATE_DROP: 0. | |
| # FFN_DROPOUT_PROB: 0. | |
| # ATTENTION_PROBS_DROPOUT_PROB: 0. | |
| # NUM_HIDDEN_LAYERS: 6 | |
| # NUM_GENERATION_LAYERS: 0 | |
| # DROP_PATH_PROB_FIXED: True | |
| # Mixture-of-experts settings: MOE is enabled with MOE_TYPE 'attribute', 8 experts and TOP_K 2. | |
| MOE: | |
| MOE: True | |
| MOE_TYPE: 'attribute' | |
| TAG_Transform: True | |
| ATTRIBUTE_LENGTH: 8 | |
| EP_WORLD_SIZE: 1 # tag moe only | |
| NUM_EXPERTS: 8 | |
| TOP_K: 2 | |
| CAPACITY_FACTOR: 3.0 | |
| # NOTE(review): EVAL_MIN_CAPACITY is a float (4.0) while MIN_CAPACITY is an int (4); | |
| # confirm the loader accepts both types. | |
| EVAL_MIN_CAPACITY: 4.0 | |
| MIN_CAPACITY: 4 | |
| NOISY_GATE_POLICY: 'vmoe' | |
| MOE_PARAM_GROUP: True | |
| # NOTE(review): comma-separated string; presumably split into expert types 'FFN' and 'SA' - confirm. | |
| MOE_EXPERT_TYPE: 'FFN,SA' | |
| SA_LINEAR_OUT_MOE: True | |
| MOE_EXPERT_LOCATION: 'all' # 'odd' | |
| # MOE_LAYER_START_IDX: 3 | |
| # MOE_LAYER_END_IDX: 21 | |
| # MOE_LAYER_START_IDX: 18 | |
| # MOE_LAYER_END_IDX: 12 | |
| BATCH_PRIO: True | |
| # NOTE(review): presumably requires the Tutel MoE library at runtime - confirm. | |
| USE_TUTEL: True | |
| FFN_SHARE_GATE_DECISION: True | |