---
# Training configuration (Hydra-style: `_target_` keys name classes to instantiate).
# NOTE(review): this file was flattened onto one line; nesting below is reconstructed
# from the key structure — verify against the original config before relying on it.

loggers:
  tensorboard:
    _target_: src.trainer.TensorBoardLogger
    save_dir: ./
    name: ''
    version: null

callbacks:
  lr_monitor:
    _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
  grad_norm:
    _target_: src.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true
  speed_monitor:
    _target_: src.callbacks.speed_monitor.SpeedMonitor
  grad_accum:
    _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
    # Mapping of step -> accumulation factor; 0: 2 means accumulate 2 batches
    # from the first step (presumed scheduler semantics — confirm against the class).
    scheduling:
      0: 2
  model_checkpoint:
    _target_: src.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    every_n_train_steps: 1000
    save_top_k: -1
    save_last: link
    verbose: true
    # NOTE(review): `save_initial_checkpoint` also exists at top level; this first
    # occurrence is assumed to belong to the callback — placing both at top level
    # would be an invalid duplicate key. Confirm against the consuming code.
    save_initial_checkpoint: true

# Run / experiment identification
out_parent_folder: model_train
tok_name: bytelevel2
run_folder: .
dataset: common-corpus
pwd: /home/zg258/rds/hpc-work/infotokenization
train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel2/train
val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel2/validation
model: fw57M-tied
resume_from_checkpoint: .checkpoints/last.ckpt
save_initial_checkpoint: true
seed: 42
torch_compile: true

# DataLoader settings
data:
  batch_size: 64
  eval_batch_size: 64
  shuffle: true
  drop_last: false
  num_workers: 12
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null

# Optimizer and LR schedule
optim:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.01
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
      - 0.9
      - 0.95
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_stable_steps: 44000
    num_decay_steps: 4000
    min_lr_ratio: 0.01

# Trainer loop settings
trainer:
  accelerator: gpu
  devices: 1
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  val_check_interval: 1000
  max_steps: 50000
  limit_val_batches: 500

evaluation:
  blimp: false