| activation: softmax |
| adam_beta1: 0.9 |
| adam_beta2: 0.99 |
| adam_epsilon: 1.0e-06 |
| alpha: 0.1 |
| attn_implementation: null |
| beta: 0.125 |
| bf16: true |
| block_size: 512 |
| checkpoint_dir: mlruns/896390784617014591/892b97fa0aa6499288906c463545ae00/checkpoints |
| compile: false |
| config_path: configs/JZ/NRJ_base-wiki-original.yaml |
| dataloader_num_workers: 8 |
| dataset_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/ |
| ddp_find_unused_parameters: false |
| disable_tqdm: true |
| do_eval: true |
| dropout: 0.1 |
| embedding_dim: 768 |
| eval_steps: 25000 |
| evaluation_strategy: steps |
| forward_memories: 3072 |
| fp16: false |
| gradient_accumulation_steps: 1 |
| ignore_lines: false |
| layer_norm: 1.0e-12 |
| learning_rate: 0.0007 |
| log_on_each_node: false |
| logging_steps: 1000 |
| logging_strategy: steps |
| lr_scheduler_kwargs: {} |
| lr_scheduler_type: cosine |
| max_steps: 500000 |
| model_name: NRJ-V_30000K_bpe-NL12-NH12-EMB768-FFN3072 |
| model_type: energyBERT |
| n_run: 51 |
| num_heads: 12 |
| num_layers: 12 |
| num_params: 50638896 |
| optimizer: adamw_torch |
| output_dir: null |
| per_device_eval_batch_size: 8 |
| per_device_train_batch_size: 64 |
| remove_unused_columns: false |
| report_to: mlflow |
| save_steps: 25000 |
| save_strategy: steps |
| seed: 42 |
| share_layers: false |
| test_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.test.txt |
| tie_weights: false |
| tokenizer_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/tokenizer |
| tokenizer_type: bpe |
| total_batch_size: 4096 |
| training_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.train.txt |
| valid_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.valid.txt |
| vocabulary_size: 30000 |
| warmup_ratio: 0.0 |
| warmup_steps: 24000 |
| weight_decay: 0.01 |
|
|