# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb

checkpoint:
  no_epoch_checkpoints: true
  save_interval_updates: 50000
  keep_interval_updates: 1

distributed_training:
  distributed_world_size: 16
  ddp_backend: legacy_ddp

task:
  _name: masked_lm
  data: ???
  sample_break_mode: complete_doc
  tokens_per_sample: 512
  include_target_tokens: true
  random_token_prob: 0
  leave_unmasked_prob: 0
  mask_prob: 0.35
  mask_multiple_length: 4

criterion: model

dataset:
  max_tokens: 8192
  ignore_unused_valid_subsets: true
  skip_invalid_size_inputs_valid_test: true

optimizer:
  _name: adam
  weight_decay: 0.01
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06

lr_scheduler:
  _name: cosine
  warmup_updates: 10000

optimization:
  clip_norm: 5
  lr: [0.0002]
  max_update: 1000000
  update_freq: [1]

model:
  _name: data2vec_text
  head_layers: 2
  average_top_k_layers: 10
  layer_norm_target_layer: true
  loss_scale: 1
  ema_decay: 0.999
  ema_end_decay: 0.9999
  ema_anneal_end_step: 300000
  loss_beta: 4
  ema_transformer_layers_only: true

  transformer:
    dropout: 0.1
    attention_dropout: 0.1
    layernorm_embedding: true
    activation_fn: gelu
    no_scale_embedding: true
    max_source_positions: 512

    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 12
      attention_heads: 12
      normalize_before: false
      learned_pos: true
      layerdrop: 0