File size: 48,209 Bytes
[2026-01-25 09:54:39,812] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:13320] baseline 0.000GB ()
[2026-01-25 09:54:39,813] [INFO] [axolotl.cli.config.load_cfg:259] [PID:13320] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "train.yml",
  "base_model": "google/gemma-3-4b-it",
  "base_model_config": "google/gemma-3-4b-it",
  "batch_size": 13,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 9,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "AlexHung29629/MerlynIfeEldridge2",
      "trust_remote_code": false,
      "type": "input_output"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 13,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": false
  },
  "include_tkps": true,
  "is_multimodal": true,
  "learning_rate": 0.001,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "liger_use_token_scaling": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 13,
  "model_config_type": "gemma3",
  "num_epochs": 32.0,
  "optimizer": "sgd",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./model-out",
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "google/gemma-3-4b-it",
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_strategy": "no",
  "seed": 42,
  "sequence_len": 758,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "google/gemma-3-4b-it",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_tensorboard": true,
  "use_wandb": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.0,
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-01-25 09:54:39,935] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:13320] Loaded image size: 896 from model config
[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos>
[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos>
[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad>
[2026-01-25 09:54:42,062] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk>
[2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:13320] Unable to find prepared dataset in last_run_prepared/79c123e6ef0babe72cf6db37825069f8
[2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:13320] Loading raw datasets...
[2026-01-25 09:54:42,063] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:13320] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2026-01-25 09:54:42,948] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:13320] Loading dataset: AlexHung29629/MerlynIfeEldridge2 with base_type: input_output and prompt_style: None
[2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:13320] min_input_len: 152
[2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:13320] max_input_len: 676

Saving the dataset (0/1 shards):   0%|                                                                                                           | 0/13 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 63.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 63.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 45.17 examples/s]
[2026-01-25 09:54:43,829] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:13320] total_num_tokens: 4_827
[2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:13320] `total_supervised_tokens: 43`
[2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:13320] total_num_steps: 32
[2026-01-25 09:54:43,832] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:13320] Maximum number of steps set at 32
[2026-01-25 09:54:43,942] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:13320] loading tokenizer... google/gemma-3-4b-it
[2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos>
[2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos>
[2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad>
[2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk>
[2026-01-25 09:54:54,079] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:13320] Loading model
[2026-01-25 09:54:54,167] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:13320] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-25 09:54:54,169] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:13320] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-25 09:54:54,266] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:13320] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True}

Loading checkpoint shards:   0%|                                                                                                                         | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|████████████████████████████████████████████████████████▌                                                        | 1/2 [00:01<00:01,  1.96s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.54s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.60s/it]
[2026-01-25 09:55:10,541] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:13320] Memory usage after model load 0.000GB ()
[2026-01-25 09:56:09,234] [INFO] [axolotl.train.save_initial_configs:417] [PID:13320] Pre-saving tokenizer to ./model-out...
[2026-01-25 09:56:09,770] [INFO] [axolotl.train.save_initial_configs:422] [PID:13320] Pre-saving model config to ./model-out...
[2026-01-25 09:56:09,777] [INFO] [axolotl.train.save_initial_configs:426] [PID:13320] Pre-saving processor to ./model-out...
[2026-01-25 09:56:13,230] [INFO] [axolotl.train.execute_training:212] [PID:13320] Starting trainer...

  0%|                                                                                                                                                   | 0/32 [00:00<?, ?it/s]
  3%|████▎                                                                                                                                      | 1/32 [00:07<03:38,  7.05s/it]
                                                                                                                                                                               
{'loss': 0.0345, 'grad_norm': 61.53063201904297, 'learning_rate': 0.001, 'ppl': 1.0351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 6.18751859664917, 'tokens/total': 9152, 'tokens/trainable': 43, 'epoch': 1.0}

  3%|████▎                                                                                                                                      | 1/32 [00:07<03:38,  7.05s/it]
  6%|████████▋                                                                                                                                  | 2/32 [00:13<03:17,  6.57s/it]
                                                                                                                                                                               
{'loss': 0.033, 'grad_norm': 57.19621276855469, 'learning_rate': 0.001, 'ppl': 1.03355, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112571716308594, 'tokens/total': 18304, 'tokens/trainable': 86, 'epoch': 2.0}

  6%|████████▋                                                                                                                                  | 2/32 [00:13<03:17,  6.57s/it]
  9%|█████████████                                                                                                                              | 3/32 [00:19<03:06,  6.42s/it]
                                                                                                                                                                               
{'loss': 0.0321, 'grad_norm': 57.623077392578125, 'learning_rate': 0.001, 'ppl': 1.03262, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.122233867645264, 'tokens/total': 27456, 'tokens/trainable': 129, 'epoch': 3.0}

  9%|█████████████                                                                                                                              | 3/32 [00:19<03:06,  6.42s/it]
 12%|█████████████████▍                                                                                                                         | 4/32 [00:25<02:57,  6.35s/it]
                                                                                                                                                                               
{'loss': 0.0299, 'grad_norm': 63.824161529541016, 'learning_rate': 0.001, 'ppl': 1.03035, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.1125569343566895, 'tokens/total': 36608, 'tokens/trainable': 172, 'epoch': 4.0}

 12%|█████████████████▍                                                                                                                         | 4/32 [00:25<02:57,  6.35s/it]
 16%|█████████████████████▋                                                                                                                     | 5/32 [00:32<02:50,  6.31s/it]
                                                                                                                                                                               
{'loss': 0.03, 'grad_norm': 61.47892761230469, 'learning_rate': 0.001, 'ppl': 1.03045, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112518787384033, 'tokens/total': 45760, 'tokens/trainable': 215, 'epoch': 5.0}

 16%|█████████████████████▋                                                                                                                     | 5/32 [00:32<02:50,  6.31s/it]
 19%|██████████████████████████                                                                                                                 | 6/32 [00:38<02:43,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0242, 'grad_norm': 40.61567687988281, 'learning_rate': 0.001, 'ppl': 1.0245, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.104858875274658, 'tokens/total': 54912, 'tokens/trainable': 258, 'epoch': 6.0}

 19%|██████████████████████████                                                                                                                 | 6/32 [00:38<02:43,  6.29s/it]
 22%|██████████████████████████████▍                                                                                                            | 7/32 [00:44<02:36,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.0225, 'grad_norm': 31.520526885986328, 'learning_rate': 0.001, 'ppl': 1.02276, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.103605270385742, 'tokens/total': 64064, 'tokens/trainable': 301, 'epoch': 7.0}

 22%|██████████████████████████████▍                                                                                                            | 7/32 [00:44<02:36,  6.28s/it]
 25%|██████████████████████████████████▊                                                                                                        | 8/32 [00:50<02:30,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0217, 'grad_norm': 29.32663917541504, 'learning_rate': 0.001, 'ppl': 1.02194, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.096944332122803, 'tokens/total': 73216, 'tokens/trainable': 344, 'epoch': 8.0}

 25%|██████████████████████████████████▊                                                                                                        | 8/32 [00:50<02:30,  6.27s/it]
 28%|███████████████████████████████████████                                                                                                    | 9/32 [00:57<02:24,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0211, 'grad_norm': 26.701892852783203, 'learning_rate': 0.001, 'ppl': 1.02132, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.088647842407227, 'tokens/total': 82368, 'tokens/trainable': 387, 'epoch': 9.0}

 28%|███████████████████████████████████████                                                                                                    | 9/32 [00:57<02:24,  6.27s/it]
 31%|███████████████████████████████████████████▏                                                                                              | 10/32 [01:03<02:17,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0205, 'grad_norm': 24.277631759643555, 'learning_rate': 0.001, 'ppl': 1.02071, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084709167480469, 'tokens/total': 91520, 'tokens/trainable': 430, 'epoch': 10.0}

 31%|███████████████████████████████████████████▏                                                                                              | 10/32 [01:03<02:17,  6.27s/it]
 34%|███████████████████████████████████████████████▍                                                                                          | 11/32 [01:09<02:11,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.02, 'grad_norm': 24.709354400634766, 'learning_rate': 0.001, 'ppl': 1.0202, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084074020385742, 'tokens/total': 100672, 'tokens/trainable': 473, 'epoch': 11.0}

 34%|███████████████████████████████████████████████▍                                                                                          | 11/32 [01:09<02:11,  6.27s/it]
 38%|███████████████████████████████████████████████████▊                                                                                      | 12/32 [01:15<02:05,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0187, 'grad_norm': 23.36050033569336, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.071046829223633, 'tokens/total': 109824, 'tokens/trainable': 516, 'epoch': 12.0}

 38%|███████████████████████████████████████████████████▊                                                                                      | 12/32 [01:15<02:05,  6.27s/it]
 41%|████████████████████████████████████████████████████████                                                                                  | 13/32 [01:22<01:59,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0187, 'grad_norm': 25.07172393798828, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0752339363098145, 'tokens/total': 118976, 'tokens/trainable': 559, 'epoch': 13.0}

 41%|████████████████████████████████████████████████████████                                                                                  | 13/32 [01:22<01:59,  6.27s/it]
 44%|████████████████████████████████████████████████████████████▍                                                                             | 14/32 [01:28<01:52,  6.27s/it]
                                                                                                                                                                               
{'loss': 0.0172, 'grad_norm': 24.219331741333008, 'learning_rate': 0.001, 'ppl': 1.01735, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.075183391571045, 'tokens/total': 128128, 'tokens/trainable': 602, 'epoch': 14.0}

 44%|████████████████████████████████████████████████████████████▍                                                                             | 14/32 [01:28<01:52,  6.27s/it]
 47%|████████████████████████████████████████████████████████████████▋                                                                         | 15/32 [01:34<01:46,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.0166, 'grad_norm': 23.965293884277344, 'learning_rate': 0.001, 'ppl': 1.01674, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.073108673095703, 'tokens/total': 137280, 'tokens/trainable': 645, 'epoch': 15.0}

 47%|████████████████████████████████████████████████████████████████▋                                                                         | 15/32 [01:34<01:46,  6.28s/it]
 50%|█████████████████████████████████████████████████████████████████████                                                                     | 16/32 [01:40<01:40,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.0139, 'grad_norm': 21.725933074951172, 'learning_rate': 0.001, 'ppl': 1.014, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.072548866271973, 'tokens/total': 146432, 'tokens/trainable': 688, 'epoch': 16.0}

 50%|█████████████████████████████████████████████████████████████████████                                                                     | 16/32 [01:41<01:40,  6.28s/it]
 53%|█████████████████████████████████████████████████████████████████████████▎                                                                | 17/32 [01:47<01:34,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.013, 'grad_norm': 19.918394088745117, 'learning_rate': 0.001, 'ppl': 1.01308, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070837497711182, 'tokens/total': 155584, 'tokens/trainable': 731, 'epoch': 17.0}

 53%|█████████████████████████████████████████████████████████████████████████▎                                                                | 17/32 [01:47<01:34,  6.28s/it]
 56%|█████████████████████████████████████████████████████████████████████████████▋                                                            | 18/32 [01:53<01:28,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0111, 'grad_norm': 16.317699432373047, 'learning_rate': 0.001, 'ppl': 1.01116, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070370674133301, 'tokens/total': 164736, 'tokens/trainable': 774, 'epoch': 18.0}

 56%|█████████████████████████████████████████████████████████████████████████████▋                                                            | 18/32 [01:53<01:28,  6.29s/it]
 59%|█████████████████████████████████████████████████████████████████████████████████▉                                                        | 19/32 [01:59<01:21,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0105, 'grad_norm': 15.480484008789062, 'learning_rate': 0.001, 'ppl': 1.01056, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067584037780762, 'tokens/total': 173888, 'tokens/trainable': 817, 'epoch': 19.0}

 59%|█████████████████████████████████████████████████████████████████████████████████▉                                                        | 19/32 [01:59<01:21,  6.29s/it]
 62%|██████████████████████████████████████████████████████████████████████████████████████▎                                                   | 20/32 [02:06<01:15,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0092, 'grad_norm': 15.762852668762207, 'learning_rate': 0.001, 'ppl': 1.00924, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.068929195404053, 'tokens/total': 183040, 'tokens/trainable': 860, 'epoch': 20.0}

 62%|██████████████████████████████████████████████████████████████████████████████████████▎                                                   | 20/32 [02:06<01:15,  6.29s/it]
 66%|██████████████████████████████████████████████████████████████████████████████████████████▌                                               | 21/32 [02:12<01:09,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.0079, 'grad_norm': 11.2904691696167, 'learning_rate': 0.001, 'ppl': 1.00793, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069418907165527, 'tokens/total': 192192, 'tokens/trainable': 903, 'epoch': 21.0}

 66%|██████████████████████████████████████████████████████████████████████████████████████████▌                                               | 21/32 [02:12<01:09,  6.28s/it]
 69%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 22/32 [02:18<01:02,  6.28s/it]
                                                                                                                                                                               
{'loss': 0.0074, 'grad_norm': 10.677675247192383, 'learning_rate': 0.001, 'ppl': 1.00743, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0709052085876465, 'tokens/total': 201344, 'tokens/trainable': 946, 'epoch': 22.0}

 69%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 22/32 [02:18<01:02,  6.28s/it]
 72%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 23/32 [02:24<00:56,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0063, 'grad_norm': 8.554458618164062, 'learning_rate': 0.001, 'ppl': 1.00632, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069887638092041, 'tokens/total': 210496, 'tokens/trainable': 989, 'epoch': 23.0}

 72%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 23/32 [02:25<00:56,  6.29s/it]
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 24/32 [02:31<00:50,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0058, 'grad_norm': 7.792212009429932, 'learning_rate': 0.001, 'ppl': 1.00582, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.052520275115967, 'tokens/total': 219648, 'tokens/trainable': 1032, 'epoch': 24.0}

 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 24/32 [02:31<00:50,  6.29s/it]
 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 25/32 [02:37<00:44,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0047, 'grad_norm': 5.932632923126221, 'learning_rate': 0.001, 'ppl': 1.00471, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.059691429138184, 'tokens/total': 228800, 'tokens/trainable': 1075, 'epoch': 25.0}

 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 25/32 [02:37<00:44,  6.29s/it]
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 26/32 [02:43<00:37,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0046, 'grad_norm': 5.608907699584961, 'learning_rate': 0.001, 'ppl': 1.00461, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0648603439331055, 'tokens/total': 237952, 'tokens/trainable': 1118, 'epoch': 26.0}

 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 26/32 [02:43<00:37,  6.29s/it]
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 27/32 [02:50<00:31,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0043, 'grad_norm': 5.099766254425049, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.065439701080322, 'tokens/total': 247104, 'tokens/trainable': 1161, 'epoch': 27.0}

 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 27/32 [02:50<00:31,  6.29s/it]
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 28/32 [02:56<00:25,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0043, 'grad_norm': 4.663393020629883, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.063167095184326, 'tokens/total': 256256, 'tokens/trainable': 1204, 'epoch': 28.0}

 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 28/32 [02:56<00:25,  6.29s/it]
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 29/32 [03:02<00:18,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0033, 'grad_norm': 3.509425163269043, 'learning_rate': 0.001, 'ppl': 1.00331, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.064444065093994, 'tokens/total': 265408, 'tokens/trainable': 1247, 'epoch': 29.0}

 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 29/32 [03:02<00:18,  6.29s/it]
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 30/32 [03:09<00:12,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0034, 'grad_norm': 3.3978261947631836, 'learning_rate': 0.001, 'ppl': 1.00341, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067255973815918, 'tokens/total': 274560, 'tokens/trainable': 1290, 'epoch': 30.0}

 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 30/32 [03:09<00:12,  6.29s/it]
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 31/32 [03:15<00:06,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.0035, 'grad_norm': 3.4551568031311035, 'learning_rate': 0.001, 'ppl': 1.00351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.062354564666748, 'tokens/total': 283712, 'tokens/trainable': 1333, 'epoch': 31.0}

 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 31/32 [03:15<00:06,  6.29s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [03:21<00:00,  6.29s/it]
                                                                                                                                                                               
{'loss': 0.004, 'grad_norm': 4.433701515197754, 'learning_rate': 0.001, 'ppl': 1.00401, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0588459968566895, 'tokens/total': 292864, 'tokens/trainable': 1376, 'epoch': 32.0}

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [03:21<00:00,  6.29s/it]
                                                                                                                                                                               
{'train_runtime': 201.6905, 'train_samples_per_second': 2.063, 'train_steps_per_second': 0.159, 'train_loss': 0.01493466420652112, 'memory/max_active (GiB)': 9.29, 'memory/max_allocated (GiB)': 9.29, 'memory/device_reserved (GiB)': 23.4, 'epoch': 32.0, 'tokens/train_per_sec_per_gpu': 0.0}

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [03:21<00:00,  6.29s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [03:21<00:00,  6.30s/it]
[2026-01-25 09:59:35,422] [INFO] [axolotl.train.save_trained_model:233] [PID:13320] Training completed! Saving trained model to ./model-out.
[2026-01-25 09:59:48,526] [INFO] [axolotl.train.save_trained_model:351] [PID:13320] Model successfully saved to ./model-out