| train_file: wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl |
| dev_file: wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl |
| test_file: null |
| model: |
| _target_: models.deberta.DebertaV2ForMultipleChoicePreTrain.from_pretrained |
| mlp_hidden_size: 3072 |
| fs_checkpoint: false |
| fs_checkpoint_offload_to_cpu: false |
| read_tensor: |
| _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features |
| max_neg_num: 3 |
| aug_num: 1 |
| max_seq_length: 256 |
| shuffle_context: true |
| min_rep_num: 5 |
| geo_p: 0.4 |
| deduct_ratio: 1.0 |
| context_ratio: 1.0 |
| num_workers: 32 |
| extended_vocab: null |
| collator: |
| _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext |
| max_seq_length: 256 |
| tokenizer: pretrained-models/deberta-v2-xlarge |
| mlm_probability: 0.15 |
| max_option_num: 4 |
| swap: true |
| num_workers: 4 |
| prefetch_factor: 4 |
| model_name_or_path: pretrained-models/deberta-v2-xlarge |
| pretrain: null |
| output_dir: experiments/deberta.v2.xlarge.path.v7_v8.2.2.1aug.ctx.A100.v1.4.w2.s${seed}.fsdp.adamw |
| do_train: true |
| evaluate_during_training: true |
| do_eval: false |
| eval_sub_path: null |
| do_preprocess: false |
| per_gpu_train_batch_size: 4 |
| per_gpu_eval_batch_size: 4 |
| learning_rate: 1.0e-05 |
| gradient_accumulation_steps: 512 |
| weight_decay: 0.01 |
| adam_epsilon: 1.0e-06 |
| adam_betas: (0.9, 0.98) |
| max_grad_norm: 5.0 |
| num_train_epochs: 1 |
| max_steps: 200 |
| warmup_proportion: 0.2 |
| warmup_steps: 0 |
| optimizer: null |
| use_nvlamb: null |
| bit_training: null |
| multi_tensor: null |
| logging_steps: 1 |
| save_steps: 50 |
| eval_steps: 50 |
| no_cuda: false |
| seed: 42 |
| local_rank: 0 |
| fp16: true |
| fp16_opt_level: O2 |
| ds_cfg: |
| train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size} |
| gradient_accumulation_steps: ${gradient_accumulation_steps} |
| optimizer: |
| type: AdamW |
| params: |
| lr: ${learning_rate} |
| betas: |
| - 0.9 |
| - 0.999 |
| eps: ${adam_epsilon} |
| weight_decay: ${weight_decay} |
| scheduler: |
| type: WarmupDecayLR |
| params: |
| total_num_steps: null |
| warmup_max_lr: ${learning_rate} |
| warmup_num_steps: null |
| warmup_type: linear |
| gradient_clipping: ${max_grad_norm} |
| fp16: |
| enabled: ${fp16} |
| initial_scale_power: 12 |
| zero_optimization: |
| stage: 3 |
| steps_per_print: 1024 |
| reshard_after_forward: false |
| flatten_parameters: true |
| move_grads_to_cpu: false |
| move_params_to_cpu: false |
| n_gpu: 1 |
| device: cuda:0 |
| train_batch_size: 4 |
| eval_batch_size: 4 |
| note: null |
|
|