---
loggers:
  tensorboard:
    _target_: primer.trainer.TensorBoardLogger
    save_dir: ./
    name: ''
    version: null
callbacks:
  lr_monitor:
    _target_: primer.callbacks.lr_monitor.SimpleLearningRateMonitor
  grad_norm:
    _target_: primer.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true
  speed_monitor:
    _target_: primer.callbacks.speed_monitor.SpeedMonitor
  model_checkpoint:
    _target_: primer.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    every_n_train_steps: 2000
    save_top_k: -1
    save_last: link
    verbose: true
    save_initial_checkpoint: true
model:
  name: small
  model_type: llama
  head_dim: 128
  hidden_size: 768
  hidden_act: silu
  intermediate_size: 2048
  initializer_range: 0.02
  num_hidden_layers: 6
  num_attention_heads: 6
  num_key_value_heads: 6
  rms_norm_eps: 1.0e-05
  tie_word_embeddings: true
  rope_theta: 10000.0
  rope_scaling: null
  attention_bias: false
  mlp_bias: false
  attention_dropout: 0.0
  pretraining_tp: 1
pwd: /home/pl487/unimixlm
out_parent_folder: model_train
run_folder: small_multigram128k__2025-07-22T23-38-25
tok_path: /home/pl487/unimixlm/tokenizers/multigram128k
tok_subfolder: null
train_data_path: /home/pl487/unimixlm/data/multigram128k/train
val_data_path: /home/pl487/unimixlm/data/multigram128k/validation
resume_from_checkpoint: .checkpoints/last.ckpt
save_initial_checkpoint: true
seed: 42
torch_compile: true
use_liger: true
data:
  batch_size: 64
  eval_batch_size: 64
  shuffle_seed: 42
  drop_last: true
  num_workers: 8
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null
  intra_doc_causal_mask: true
optim:
  optim_name: adamw
  lr: 0.0006
  grad_acc_schedule:
    0: 2
  zloss_factor: null
  weight_decay: 0.01
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
      - 0.9
      - 0.95
    capturable: true
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_decay_steps: 4000
    min_lr_ratio: 0.0
  weight_decay_embedding: false
  set_grad_to_none: true
trainer:
  accelerator: gpu
  devices: 1
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  val_check_interval: 2000
  max_steps: 50000
  limit_val_batches: 500
  limit_train_batches: null
tok_name: multigram128k