| [2026-01-25 09:54:39,812] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:13320] baseline 0.000GB () | |
| [2026-01-25 09:54:39,813] [INFO] [axolotl.cli.config.load_cfg:259] [PID:13320] config: | |
| { | |
| "activation_offloading": false, | |
| "axolotl_config_path": "train.yml", | |
| "base_model": "google/gemma-3-4b-it", | |
| "base_model_config": "google/gemma-3-4b-it", | |
| "batch_size": 13, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_86", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "context_parallel_size": 1, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_num_proc": 9, | |
| "datasets": [ | |
| { | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "AlexHung29629/MerlynIfeEldridge2", | |
| "trust_remote_code": false, | |
| "type": "input_output" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.9.1" | |
| }, | |
| "eval_batch_size": 13, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 1, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": false | |
| }, | |
| "include_tkps": true, | |
| "is_multimodal": true, | |
| "learning_rate": 0.001, | |
| "liger_fused_linear_cross_entropy": true, | |
| "liger_glu_activation": true, | |
| "liger_layer_norm": true, | |
| "liger_rms_norm": true, | |
| "liger_rope": true, | |
| "liger_use_token_scaling": true, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "lora_dropout": 0.0, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "constant", | |
| "max_grad_norm": 1.0, | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 13, | |
| "model_config_type": "gemma3", | |
| "num_epochs": 32.0, | |
| "optimizer": "sgd", | |
| "otel_metrics_host": "localhost", | |
| "otel_metrics_port": 8000, | |
| "output_dir": "./model-out", | |
| "plugins": [ | |
| "axolotl.integrations.liger.LigerPlugin" | |
| ], | |
| "pretrain_multipack_attn": true, | |
| "processor_config": "google/gemma-3-4b-it", | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing": false, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": false, | |
| "save_safetensors": true, | |
| "save_strategy": "no", | |
| "seed": 42, | |
| "sequence_len": 758, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tf32": true, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "google/gemma-3-4b-it", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "use_otel_metrics": false, | |
| "use_ray": false, | |
| "use_tensorboard": true, | |
| "use_wandb": false, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "warmup_ratio": 0.0, | |
| "weight_decay": 0.0, | |
| "world_size": 1 | |
| } | |
| [2026-01-25 09:54:39,935] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:13320] Loaded image size: 896 from model config | |
| [2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos> | |
| [2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos> | |
| [2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad> | |
| [2026-01-25 09:54:42,062] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk> | |
| [2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:13320] Unable to find prepared dataset in last_run_prepared/79c123e6ef0babe72cf6db37825069f8 | |
| [2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:13320] Loading raw datasets... | |
| [2026-01-25 09:54:42,063] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:13320] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. | |
| [2026-01-25 09:54:42,948] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:13320] Loading dataset: AlexHung29629/MerlynIfeEldridge2 with base_type: input_output and prompt_style: None | |
| [2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:13320] min_input_len: 152 | |
| [2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:13320] max_input_len: 676 | |
| Saving the dataset (0/1 shards): 0%| | 0/13 [00:00<?, ? examples/s] Saving the dataset (0/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 63.38 examples/s] Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 63.38 examples/s] Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 45.17 examples/s] | |
| [2026-01-25 09:54:43,829] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:13320] total_num_tokens: 4_827 | |
| [2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:13320] `total_supervised_tokens: 43` | |
| [2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:13320] total_num_steps: 32 | |
| [2026-01-25 09:54:43,832] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:13320] Maximum number of steps set at 32 | |
| [2026-01-25 09:54:43,942] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:13320] loading tokenizer... google/gemma-3-4b-it | |
| [2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos> | |
| [2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos> | |
| [2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad> | |
| [2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk> | |
| [2026-01-25 09:54:54,079] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:13320] Loading model | |
| [2026-01-25 09:54:54,167] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:13320] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2026-01-25 09:54:54,169] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:13320] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2026-01-25 09:54:54,266] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:13320] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True} | |
| Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1/2 [00:01<00:01, 1.96s/it] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 2/2 [00:03<00:00, 1.54s/it] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 2/2 [00:03<00:00, 1.60s/it] | |
| [2026-01-25 09:55:10,541] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:13320] Memory usage after model load 0.000GB () | |
| [2026-01-25 09:56:09,234] [INFO] [axolotl.train.save_initial_configs:417] [PID:13320] Pre-saving tokenizer to ./model-out... | |
| [2026-01-25 09:56:09,770] [INFO] [axolotl.train.save_initial_configs:422] [PID:13320] Pre-saving model config to ./model-out... | |
| [2026-01-25 09:56:09,777] [INFO] [axolotl.train.save_initial_configs:426] [PID:13320] Pre-saving processor to ./model-out... | |
| [2026-01-25 09:56:13,230] [INFO] [axolotl.train.execute_training:212] [PID:13320] Starting trainer... | |
| 0%| | 0/32 [00:00<?, ?it/s] 3%|βββββ | 1/32 [00:07<03:38, 7.05s/it] {'loss': 0.0345, 'grad_norm': 61.53063201904297, 'learning_rate': 0.001, 'ppl': 1.0351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 6.18751859664917, 'tokens/total': 9152, 'tokens/trainable': 43, 'epoch': 1.0} | |
| 3%|βββββ | 1/32 [00:07<03:38, 7.05s/it] 6%|βββββββββ | 2/32 [00:13<03:17, 6.57s/it] {'loss': 0.033, 'grad_norm': 57.19621276855469, 'learning_rate': 0.001, 'ppl': 1.03355, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112571716308594, 'tokens/total': 18304, 'tokens/trainable': 86, 'epoch': 2.0} | |
| 6%|βββββββββ | 2/32 [00:13<03:17, 6.57s/it] 9%|βββββββββββββ | 3/32 [00:19<03:06, 6.42s/it] {'loss': 0.0321, 'grad_norm': 57.623077392578125, 'learning_rate': 0.001, 'ppl': 1.03262, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.122233867645264, 'tokens/total': 27456, 'tokens/trainable': 129, 'epoch': 3.0} | |
| 9%|βββββββββββββ | 3/32 [00:19<03:06, 6.42s/it] 12%|ββββββββββββββββββ | 4/32 [00:25<02:57, 6.35s/it] {'loss': 0.0299, 'grad_norm': 63.824161529541016, 'learning_rate': 0.001, 'ppl': 1.03035, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.1125569343566895, 'tokens/total': 36608, 'tokens/trainable': 172, 'epoch': 4.0} | |
| 12%|ββββββββββββββββββ | 4/32 [00:25<02:57, 6.35s/it] 16%|ββββββββββββββββββββββ | 5/32 [00:32<02:50, 6.31s/it] {'loss': 0.03, 'grad_norm': 61.47892761230469, 'learning_rate': 0.001, 'ppl': 1.03045, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112518787384033, 'tokens/total': 45760, 'tokens/trainable': 215, 'epoch': 5.0} | |
| 16%|ββββββββββββββββββββββ | 5/32 [00:32<02:50, 6.31s/it] 19%|ββββββββββββββββββββββββββ | 6/32 [00:38<02:43, 6.29s/it] {'loss': 0.0242, 'grad_norm': 40.61567687988281, 'learning_rate': 0.001, 'ppl': 1.0245, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.104858875274658, 'tokens/total': 54912, 'tokens/trainable': 258, 'epoch': 6.0} | |
| 19%|ββββββββββββββββββββββββββ | 6/32 [00:38<02:43, 6.29s/it] 22%|βββββββββββββββββββββββββββββββ | 7/32 [00:44<02:36, 6.28s/it] {'loss': 0.0225, 'grad_norm': 31.520526885986328, 'learning_rate': 0.001, 'ppl': 1.02276, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.103605270385742, 'tokens/total': 64064, 'tokens/trainable': 301, 'epoch': 7.0} | |
| 22%|βββββββββββββββββββββββββββββββ | 7/32 [00:44<02:36, 6.28s/it] 25%|βββββββββββββββββββββββββββββββββββ | 8/32 [00:50<02:30, 6.27s/it] {'loss': 0.0217, 'grad_norm': 29.32663917541504, 'learning_rate': 0.001, 'ppl': 1.02194, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.096944332122803, 'tokens/total': 73216, 'tokens/trainable': 344, 'epoch': 8.0} | |
| 25%|βββββββββββββββββββββββββββββββββββ | 8/32 [00:50<02:30, 6.27s/it] 28%|βββββββββββββββββββββββββββββββββββββββ | 9/32 [00:57<02:24, 6.27s/it] {'loss': 0.0211, 'grad_norm': 26.701892852783203, 'learning_rate': 0.001, 'ppl': 1.02132, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.088647842407227, 'tokens/total': 82368, 'tokens/trainable': 387, 'epoch': 9.0} | |
| 28%|βββββββββββββββββββββββββββββββββββββββ | 9/32 [00:57<02:24, 6.27s/it] 31%|ββββββββββββββββββββββββββββββββββββββββββββ | 10/32 [01:03<02:17, 6.27s/it] {'loss': 0.0205, 'grad_norm': 24.277631759643555, 'learning_rate': 0.001, 'ppl': 1.02071, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084709167480469, 'tokens/total': 91520, 'tokens/trainable': 430, 'epoch': 10.0} | |
| 31%|ββββββββββββββββββββββββββββββββββββββββββββ | 10/32 [01:03<02:17, 6.27s/it] 34%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 11/32 [01:09<02:11, 6.27s/it] {'loss': 0.02, 'grad_norm': 24.709354400634766, 'learning_rate': 0.001, 'ppl': 1.0202, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084074020385742, 'tokens/total': 100672, 'tokens/trainable': 473, 'epoch': 11.0} | |
| 34%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 11/32 [01:09<02:11, 6.27s/it] 38%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 12/32 [01:15<02:05, 6.27s/it] {'loss': 0.0187, 'grad_norm': 23.36050033569336, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.071046829223633, 'tokens/total': 109824, 'tokens/trainable': 516, 'epoch': 12.0} | |
| 38%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 12/32 [01:15<02:05, 6.27s/it] 41%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 13/32 [01:22<01:59, 6.27s/it] {'loss': 0.0187, 'grad_norm': 25.07172393798828, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0752339363098145, 'tokens/total': 118976, 'tokens/trainable': 559, 'epoch': 13.0} | |
| 41%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 13/32 [01:22<01:59, 6.27s/it] 44%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 14/32 [01:28<01:52, 6.27s/it] {'loss': 0.0172, 'grad_norm': 24.219331741333008, 'learning_rate': 0.001, 'ppl': 1.01735, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.075183391571045, 'tokens/total': 128128, 'tokens/trainable': 602, 'epoch': 14.0} | |
| 44%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 14/32 [01:28<01:52, 6.27s/it] 47%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 15/32 [01:34<01:46, 6.28s/it] {'loss': 0.0166, 'grad_norm': 23.965293884277344, 'learning_rate': 0.001, 'ppl': 1.01674, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.073108673095703, 'tokens/total': 137280, 'tokens/trainable': 645, 'epoch': 15.0} | |
| 47%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 15/32 [01:34<01:46, 6.28s/it] 50%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 16/32 [01:40<01:40, 6.28s/it] {'loss': 0.0139, 'grad_norm': 21.725933074951172, 'learning_rate': 0.001, 'ppl': 1.014, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.072548866271973, 'tokens/total': 146432, 'tokens/trainable': 688, 'epoch': 16.0} | |
| 50%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 16/32 [01:41<01:40, 6.28s/it] 53%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 17/32 [01:47<01:34, 6.28s/it] {'loss': 0.013, 'grad_norm': 19.918394088745117, 'learning_rate': 0.001, 'ppl': 1.01308, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070837497711182, 'tokens/total': 155584, 'tokens/trainable': 731, 'epoch': 17.0} | |
| 53%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 17/32 [01:47<01:34, 6.28s/it] 56%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 18/32 [01:53<01:28, 6.29s/it] {'loss': 0.0111, 'grad_norm': 16.317699432373047, 'learning_rate': 0.001, 'ppl': 1.01116, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070370674133301, 'tokens/total': 164736, 'tokens/trainable': 774, 'epoch': 18.0} | |
| 56%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 18/32 [01:53<01:28, 6.29s/it] 59%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 19/32 [01:59<01:21, 6.29s/it] {'loss': 0.0105, 'grad_norm': 15.480484008789062, 'learning_rate': 0.001, 'ppl': 1.01056, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067584037780762, 'tokens/total': 173888, 'tokens/trainable': 817, 'epoch': 19.0} | |
| 59%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 19/32 [01:59<01:21, 6.29s/it] 62%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 20/32 [02:06<01:15, 6.29s/it] {'loss': 0.0092, 'grad_norm': 15.762852668762207, 'learning_rate': 0.001, 'ppl': 1.00924, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.068929195404053, 'tokens/total': 183040, 'tokens/trainable': 860, 'epoch': 20.0} | |
| 62%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 20/32 [02:06<01:15, 6.29s/it] 66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 21/32 [02:12<01:09, 6.28s/it] {'loss': 0.0079, 'grad_norm': 11.2904691696167, 'learning_rate': 0.001, 'ppl': 1.00793, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069418907165527, 'tokens/total': 192192, 'tokens/trainable': 903, 'epoch': 21.0} | |
| 66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 21/32 [02:12<01:09, 6.28s/it] 69%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 22/32 [02:18<01:02, 6.28s/it] {'loss': 0.0074, 'grad_norm': 10.677675247192383, 'learning_rate': 0.001, 'ppl': 1.00743, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0709052085876465, 'tokens/total': 201344, 'tokens/trainable': 946, 'epoch': 22.0} | |
| 69%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 22/32 [02:18<01:02, 6.28s/it] 72%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 23/32 [02:24<00:56, 6.29s/it] {'loss': 0.0063, 'grad_norm': 8.554458618164062, 'learning_rate': 0.001, 'ppl': 1.00632, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069887638092041, 'tokens/total': 210496, 'tokens/trainable': 989, 'epoch': 23.0} | |
| 72%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 23/32 [02:25<00:56, 6.29s/it] 75%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 24/32 [02:31<00:50, 6.29s/it] {'loss': 0.0058, 'grad_norm': 7.792212009429932, 'learning_rate': 0.001, 'ppl': 1.00582, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.052520275115967, 'tokens/total': 219648, 'tokens/trainable': 1032, 'epoch': 24.0} | |
| 75%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 24/32 [02:31<00:50, 6.29s/it] 78%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 25/32 [02:37<00:44, 6.29s/it] {'loss': 0.0047, 'grad_norm': 5.932632923126221, 'learning_rate': 0.001, 'ppl': 1.00471, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.059691429138184, 'tokens/total': 228800, 'tokens/trainable': 1075, 'epoch': 25.0} | |
| 78%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 25/32 [02:37<00:44, 6.29s/it] 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 26/32 [02:43<00:37, 6.29s/it] {'loss': 0.0046, 'grad_norm': 5.608907699584961, 'learning_rate': 0.001, 'ppl': 1.00461, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0648603439331055, 'tokens/total': 237952, 'tokens/trainable': 1118, 'epoch': 26.0} | |
| 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 26/32 [02:43<00:37, 6.29s/it] 84%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 27/32 [02:50<00:31, 6.29s/it] {'loss': 0.0043, 'grad_norm': 5.099766254425049, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.065439701080322, 'tokens/total': 247104, 'tokens/trainable': 1161, 'epoch': 27.0} | |
| 84%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 27/32 [02:50<00:31, 6.29s/it] 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 28/32 [02:56<00:25, 6.29s/it] {'loss': 0.0043, 'grad_norm': 4.663393020629883, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.063167095184326, 'tokens/total': 256256, 'tokens/trainable': 1204, 'epoch': 28.0} | |
| 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 28/32 [02:56<00:25, 6.29s/it] 91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 29/32 [03:02<00:18, 6.29s/it] {'loss': 0.0033, 'grad_norm': 3.509425163269043, 'learning_rate': 0.001, 'ppl': 1.00331, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.064444065093994, 'tokens/total': 265408, 'tokens/trainable': 1247, 'epoch': 29.0} | |
| 91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 29/32 [03:02<00:18, 6.29s/it] 94%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 30/32 [03:09<00:12, 6.29s/it] {'loss': 0.0034, 'grad_norm': 3.3978261947631836, 'learning_rate': 0.001, 'ppl': 1.00341, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067255973815918, 'tokens/total': 274560, 'tokens/trainable': 1290, 'epoch': 30.0} | |
| 94%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 30/32 [03:09<00:12, 6.29s/it] 97%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 31/32 [03:15<00:06, 6.29s/it] {'loss': 0.0035, 'grad_norm': 3.4551568031311035, 'learning_rate': 0.001, 'ppl': 1.00351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.062354564666748, 'tokens/total': 283712, 'tokens/trainable': 1333, 'epoch': 31.0} | |
| 97%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 31/32 [03:15<00:06, 6.29s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 32/32 [03:21<00:00, 6.29s/it] {'loss': 0.004, 'grad_norm': 4.433701515197754, 'learning_rate': 0.001, 'ppl': 1.00401, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0588459968566895, 'tokens/total': 292864, 'tokens/trainable': 1376, 'epoch': 32.0} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 32/32 [03:21<00:00, 6.29s/it] {'train_runtime': 201.6905, 'train_samples_per_second': 2.063, 'train_steps_per_second': 0.159, 'train_loss': 0.01493466420652112, 'memory/max_active (GiB)': 9.29, 'memory/max_allocated (GiB)': 9.29, 'memory/device_reserved (GiB)': 23.4, 'epoch': 32.0, 'tokens/train_per_sec_per_gpu': 0.0} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 32/32 [03:21<00:00, 6.29s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 32/32 [03:21<00:00, 6.30s/it] | |
| [2026-01-25 09:59:35,422] [INFO] [axolotl.train.save_trained_model:233] [PID:13320] Training completed! Saving trained model to ./model-out. | |
| [2026-01-25 09:59:48,526] [INFO] [axolotl.train.save_trained_model:351] [PID:13320] Model successfully saved to ./model-out | |