| loggers: | |
| tensorboard: | |
| _target_: src.trainer.TensorBoardLogger | |
| save_dir: ./ | |
| name: '' | |
| version: null | |
| callbacks: | |
| lr_monitor: | |
| _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor | |
| grad_norm: | |
| _target_: src.callbacks.grad_norm.GradNorm | |
| norm_type: 2 | |
| group_separator: / | |
| histogram_freq: null | |
| check_clipping: false | |
| log_weight_distribution: false | |
| only_total: true | |
| speed_monitor: | |
| _target_: src.callbacks.speed_monitor.SpeedMonitor | |
| grad_accum: | |
| _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler | |
| scheduling: | |
| 0: 2 | |
| model_checkpoint: | |
| _target_: src.callbacks.model_checkpoint.ModelCheckpoint | |
| dirpath: .checkpoints | |
| filename: '{step}' | |
| enable_version_counter: false | |
| every_n_train_steps: 1000 | |
| save_top_k: -1 | |
| save_last: link | |
| verbose: true | |
| save_initial_checkpoint: true | |
| out_parent_folder: model_train | |
| tok_name: bytelevel2 | |
| run_folder: . | |
| dataset: common-corpus | |
| pwd: /home/zg258/rds/hpc-work/infotokenization | |
| train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel2/train | |
| val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel2/validation | |
| model: fw57M-tied | |
| resume_from_checkpoint: .checkpoints/last.ckpt | |
| save_initial_checkpoint: true | |
| seed: 42 | |
| torch_compile: true | |
| data: | |
| batch_size: 64 | |
| eval_batch_size: 64 | |
| shuffle: true | |
| drop_last: false | |
| num_workers: 12 | |
| pin_memory: true | |
| persistent_workers: false | |
| prefetch_factor: 2 | |
| multiprocessing_context: null | |
| optim: | |
| optim_name: adamw | |
| lr: 0.0006 | |
| weight_decay: 0.01 | |
| optim_kwargs: | |
| fused: true | |
| eps: 1.0e-08 | |
| betas: | |
| - 0.9 | |
| - 0.95 | |
| scheduler_name: warmup_stable_decay | |
| num_warmup_steps: 2000 | |
| scheduler_kwargs: | |
| num_stable_steps: 44000 | |
| num_decay_steps: 4000 | |
| min_lr_ratio: 0.01 | |
| trainer: | |
| accelerator: gpu | |
| devices: 1 | |
| precision: bf16-true | |
| deterministic: false | |
| log_every_n_steps: 1 | |
| enable_progress_bar: true | |
| fast_dev_run: false | |
| gradient_clip_val: 1.0 | |
| gradient_clip_algorithm: norm | |
| val_check_interval: 1000 | |
| max_steps: 50000 | |
| limit_val_batches: 500 | |
| evaluation: | |
| blimp: false | |