```yaml
config:
  vocab_size: 128128
  bos_token_id: 2
  eos_token_id: 1
  pad_token_id: 3
  torch_dtype: bfloat16
  use_cache: false
  max_position_embeddings: 2048
  _attn_implementation: flash_attention_2
  name: small
  model_type: llama
  head_dim: 128
  hidden_size: 768
  hidden_act: silu
  intermediate_size: 2048
  initializer_range: 0.02
  num_hidden_layers: 6
  num_attention_heads: 6
  num_key_value_heads: 6
  rms_norm_eps: 1.0e-05
  tie_word_embeddings: true
  rope_theta: 10000.0
  rope_scaling: null
  attention_bias: false
  mlp_bias: false
  attention_dropout: 0.0
  pretraining_tp: 1
```
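For reference, a minimal sketch of how this `config` block would map onto a Hugging Face `LlamaConfig`, assuming the repo builds its model through `transformers`; the actual construction code may differ, and `name: small` is the repo's own size label rather than a `LlamaConfig` field. The `head_dim` keyword requires a reasonably recent `transformers` release (otherwise it is derived as `hidden_size // num_attention_heads`, which gives the same 128 here).

```python
import torch
from transformers import AutoModelForCausalLM, LlamaConfig

# Sketch only: assumes the model is instantiated via `transformers`.
config = LlamaConfig(
    vocab_size=128128,
    hidden_size=768,
    intermediate_size=2048,
    num_hidden_layers=6,
    num_attention_heads=6,
    num_key_value_heads=6,  # equal to num_attention_heads: plain MHA, no GQA
    head_dim=128,           # 768 hidden dims / 6 heads
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-5,
    use_cache=False,        # no KV cache needed during training
    bos_token_id=2,
    eos_token_id=1,
    pad_token_id=3,
    tie_word_embeddings=True,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    mlp_bias=False,
    attention_dropout=0.0,
    pretraining_tp=1,
)
model = AutoModelForCausalLM.from_config(
    config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
)
```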
```yaml
optim_config:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.01
  weight_decay_embedding: false
  set_grad_to_none: true
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
      - 0.9
      - 0.95
    capturable: true
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_decay_steps: 4000
    min_lr_ratio: 0.0
grad_acc_schedule:
  0: 2
zloss_factor: null
use_torch_compile: true
use_liger: true
```
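The optimizer maps directly onto `torch.optim.AdamW` (`fused` and `capturable` are native AdamW options). `warmup_stable_decay` reads as the trapezoidal schedule: linear warmup over 2000 steps, a flat plateau at the peak learning rate, then a decay over the final 4000 steps down to `min_lr_ratio` times the peak. The decay shape is not pinned down by the config; the sketch below assumes a linear decay and takes a `total_steps` argument that is not part of the config above.

```python
import torch

def build_optimizer_and_scheduler(model, total_steps, num_warmup_steps=2000,
                                  num_decay_steps=4000, min_lr_ratio=0.0):
    """Sketch of the optim_config above. `total_steps` is an assumption;
    the decay is taken to occupy the last `num_decay_steps` steps."""
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=6e-4, weight_decay=0.01,
        betas=(0.9, 0.95), eps=1e-8, fused=True, capturable=True,
    )
    decay_start = total_steps - num_decay_steps

    def lr_lambda(step):
        if step < num_warmup_steps:      # linear warmup: 0 -> 1
            return step / max(1, num_warmup_steps)
        if step < decay_start:           # stable plateau at peak LR
            return 1.0
        frac = (step - decay_start) / max(1, num_decay_steps)
        return max(min_lr_ratio, 1.0 - (1.0 - min_lr_ratio) * frac)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler
```

`weight_decay_embedding: false` would additionally place the embedding weights in a zero-decay parameter group, which the sketch omits for brevity.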
```yaml
train_data_path: /home/pl487/unimixlm/data/multigram128k/train
val_data_path: /home/pl487/unimixlm/data/multigram128k/validation
seq_len: 2048
eos_token_id: 1
dataloader_config:
  batch_size: 64
  eval_batch_size: 64
  shuffle_seed: 42
  intra_doc_causal_mask: true
  num_workers: 8
  pin_memory: true
  drop_last: true
  persistent_workers: false
  multiprocessing_context: null
  prefetch_factor: 2
```
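`intra_doc_causal_mask: true` indicates that each packed 2048-token sequence is masked so tokens attend causally only within their own document, with `eos_token_id: 1` marking the boundaries. A sketch of such a mask is below; the repo's actual implementation (e.g. via flash-attention's variable-length kernels) may differ. Note that with `batch_size: 64`, `seq_len: 2048`, and gradient accumulation factor 2, one optimizer step consumes 64 × 2 × 2048 = 262,144 tokens per device.

```python
import torch

def intra_doc_causal_mask(input_ids: torch.Tensor, eos_token_id: int = 1) -> torch.Tensor:
    """Boolean (seq, seq) mask, True where attention is allowed.

    Causal within each packed document; no attention across eos boundaries.
    `input_ids` is a single packed sequence of shape (seq,). Sketch only.
    """
    seq_len = input_ids.shape[0]
    # Document index per position: increments on the token *after* each eos,
    # so the eos itself still belongs to the document it terminates.
    boundaries = (input_ids == eos_token_id).long()
    doc_ids = torch.zeros(seq_len, dtype=torch.long)
    doc_ids[1:] = boundaries[:-1].cumsum(0)
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    same_doc = doc_ids.unsqueeze(0) == doc_ids.unsqueeze(1)
    return causal & same_doc
```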