| [2026-01-25 09:34:01,866] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:10047] baseline 0.000GB () | |
| [2026-01-25 09:34:01,867] [INFO] [axolotl.cli.config.load_cfg:259] [PID:10047] config: | |
| { | |
| "activation_offloading": false, | |
| "axolotl_config_path": "train.yml", | |
| "base_model": "google/gemma-3-4b-it", | |
| "base_model_config": "google/gemma-3-4b-it", | |
| "batch_size": 13, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_86", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "context_parallel_size": 1, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_num_proc": 9, | |
| "datasets": [ | |
| { | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "AlexHung29629/MerlynIfeEldridge2", | |
| "trust_remote_code": false, | |
| "type": "input_output" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.9.1" | |
| }, | |
| "eval_batch_size": 13, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 1, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": false | |
| }, | |
| "include_tkps": true, | |
| "is_multimodal": true, | |
| "learning_rate": 0.001, | |
| "liger_fused_linear_cross_entropy": true, | |
| "liger_glu_activation": true, | |
| "liger_layer_norm": true, | |
| "liger_rms_norm": true, | |
| "liger_rope": true, | |
| "liger_use_token_scaling": true, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "lora_dropout": 0.0, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "constant", | |
| "max_grad_norm": 1.0, | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 13, | |
| "model_config_type": "gemma3", | |
| "num_epochs": 16.0, | |
| "optimizer": "sgd", | |
| "otel_metrics_host": "localhost", | |
| "otel_metrics_port": 8000, | |
| "output_dir": "./model-out", | |
| "plugins": [ | |
| "axolotl.integrations.liger.LigerPlugin" | |
| ], | |
| "pretrain_multipack_attn": true, | |
| "processor_config": "google/gemma-3-4b-it", | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing": false, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": false, | |
| "save_safetensors": true, | |
| "save_strategy": "no", | |
| "seed": 42, | |
| "sequence_len": 758, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tf32": true, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "google/gemma-3-4b-it", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "use_otel_metrics": false, | |
| "use_ray": false, | |
| "use_tensorboard": true, | |
| "use_wandb": false, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "warmup_ratio": 0.0, | |
| "weight_decay": 0.0, | |
| "world_size": 1 | |
| } | |
| [2026-01-25 09:34:01,983] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:10047] Loaded image size: 896 from model config | |
| [2026-01-25 09:34:03,852] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:10047] EOS: 1 / <eos> | |
| [2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:10047] BOS: 2 / <bos> | |
| [2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:10047] PAD: 0 / <pad> | |
| [2026-01-25 09:34:03,853] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:10047] UNK: 3 / <unk> | |
| [2026-01-25 09:34:03,853] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:10047] Unable to find prepared dataset in last_run_prepared/79c123e6ef0babe72cf6db37825069f8 | |
| [2026-01-25 09:34:03,854] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:10047] Loading raw datasets... | |
| [2026-01-25 09:34:03,854] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:10047] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. | |
| [2026-01-25 09:34:08,929] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:10047] Loading dataset: AlexHung29629/MerlynIfeEldridge2 with base_type: input_output and prompt_style: None | |
| [2026-01-25 09:34:09,525] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:10047] min_input_len: 152 | |
| [2026-01-25 09:34:09,526] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:10047] max_input_len: 676 | |
| Saving the dataset (0/1 shards): 0%| | 0/13 [00:00<?, ? examples/s] Saving the dataset (0/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 59.57 examples/s] Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 59.57 examples/s] Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 13/13 [00:00<00:00, 41.66 examples/s] | |
| [2026-01-25 09:34:10,159] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:10047] total_num_tokens: 4_827 | |
| [2026-01-25 09:34:10,162] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:10047] `total_supervised_tokens: 43` | |
| [2026-01-25 09:34:10,162] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:10047] total_num_steps: 16 | |
| [2026-01-25 09:34:10,163] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:10047] Maximum number of steps set at 16 | |
| [2026-01-25 09:34:10,283] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:10047] loading tokenizer... google/gemma-3-4b-it | |
| [2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:10047] EOS: 1 / <eos> | |
| [2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:10047] BOS: 2 / <bos> | |
| [2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:10047] PAD: 0 / <pad> | |
| [2026-01-25 09:34:12,371] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:10047] UNK: 3 / <unk> | |
| [2026-01-25 09:34:23,539] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:10047] Loading model | |
| [2026-01-25 09:34:23,660] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:10047] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2026-01-25 09:34:23,662] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:10047] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2026-01-25 09:34:23,793] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:10047] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True} | |
| Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1/2 [00:01<00:01, 1.58s/it] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 2/2 [00:02<00:00, 1.18s/it] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 2/2 [00:02<00:00, 1.24s/it] | |
| [2026-01-25 09:34:32,161] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:10047] Memory usage after model load 0.000GB () | |
| [2026-01-25 09:35:21,180] [INFO] [axolotl.train.save_initial_configs:417] [PID:10047] Pre-saving tokenizer to ./model-out... | |
| [2026-01-25 09:35:21,802] [INFO] [axolotl.train.save_initial_configs:422] [PID:10047] Pre-saving model config to ./model-out... | |
| [2026-01-25 09:35:21,809] [INFO] [axolotl.train.save_initial_configs:426] [PID:10047] Pre-saving processor to ./model-out... | |
| [2026-01-25 09:35:25,524] [INFO] [axolotl.train.execute_training:212] [PID:10047] Starting trainer... | |
| 0%| | 0/16 [00:00<?, ?it/s] 6%|βββββββββ | 1/16 [00:07<01:45, 7.05s/it] {'loss': 0.0345, 'grad_norm': 61.53063201904297, 'learning_rate': 0.001, 'ppl': 1.0351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 6.187943458557129, 'tokens/total': 9152, 'tokens/trainable': 43, 'epoch': 1.0} | |
| 6%|βββββββββ | 1/16 [00:07<01:45, 7.05s/it] 12%|ββββββββββββββββββ | 2/16 [00:13<01:31, 6.54s/it] {'loss': 0.033, 'grad_norm': 57.19621276855469, 'learning_rate': 0.001, 'ppl': 1.03355, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.120673656463623, 'tokens/total': 18304, 'tokens/trainable': 86, 'epoch': 2.0} | |
| 12%|ββββββββββββββββββ | 2/16 [00:13<01:31, 6.54s/it] 19%|ββββββββββββββββββββββββββ | 3/16 [00:19<01:23, 6.39s/it] {'loss': 0.0321, 'grad_norm': 57.623077392578125, 'learning_rate': 0.001, 'ppl': 1.03262, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.125824928283691, 'tokens/total': 27456, 'tokens/trainable': 129, 'epoch': 3.0} | |
| 19%|ββββββββββββββββββββββββββ | 3/16 [00:19<01:23, 6.39s/it] 25%|βββββββββββββββββββββββββββββββββββ | 4/16 [00:25<01:15, 6.33s/it] {'loss': 0.0299, 'grad_norm': 63.824161529541016, 'learning_rate': 0.001, 'ppl': 1.03035, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.115628719329834, 'tokens/total': 36608, 'tokens/trainable': 172, 'epoch': 4.0} | |
| 25%|βββββββββββββββββββββββββββββββββββ | 4/16 [00:25<01:15, 6.33s/it] 31%|ββββββββββββββββββββββββββββββββββββββββββββ | 5/16 [00:31<01:09, 6.29s/it] {'loss': 0.03, 'grad_norm': 61.47892761230469, 'learning_rate': 0.001, 'ppl': 1.03045, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.117379665374756, 'tokens/total': 45760, 'tokens/trainable': 215, 'epoch': 5.0} | |
| 31%|ββββββββββββββββββββββββββββββββββββββββββββ | 5/16 [00:32<01:09, 6.29s/it] 38%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 6/16 [00:38<01:02, 6.27s/it] {'loss': 0.0242, 'grad_norm': 40.61567687988281, 'learning_rate': 0.001, 'ppl': 1.0245, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.108977317810059, 'tokens/total': 54912, 'tokens/trainable': 258, 'epoch': 6.0} | |
| 38%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 6/16 [00:38<01:02, 6.27s/it] 44%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 7/16 [00:44<00:56, 6.26s/it] {'loss': 0.0225, 'grad_norm': 31.520526885986328, 'learning_rate': 0.001, 'ppl': 1.02276, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.105500221252441, 'tokens/total': 64064, 'tokens/trainable': 301, 'epoch': 7.0} | |
| 44%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 7/16 [00:44<00:56, 6.26s/it] 50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 8/16 [00:50<00:50, 6.26s/it] {'loss': 0.0217, 'grad_norm': 29.32663917541504, 'learning_rate': 0.001, 'ppl': 1.02194, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.099156379699707, 'tokens/total': 73216, 'tokens/trainable': 344, 'epoch': 8.0} | |
| 50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 8/16 [00:50<00:50, 6.26s/it] 56%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 9/16 [00:56<00:43, 6.26s/it] {'loss': 0.0211, 'grad_norm': 26.701892852783203, 'learning_rate': 0.001, 'ppl': 1.02132, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0905585289001465, 'tokens/total': 82368, 'tokens/trainable': 387, 'epoch': 9.0} | |
| 56%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 9/16 [00:56<00:43, 6.26s/it] 62%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 10/16 [01:03<00:37, 6.26s/it] {'loss': 0.0205, 'grad_norm': 24.277631759643555, 'learning_rate': 0.001, 'ppl': 1.02071, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.076664447784424, 'tokens/total': 91520, 'tokens/trainable': 430, 'epoch': 10.0} | |
| 62%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 10/16 [01:03<00:37, 6.26s/it] 69%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 11/16 [01:09<00:31, 6.26s/it] {'loss': 0.02, 'grad_norm': 24.709354400634766, 'learning_rate': 0.001, 'ppl': 1.0202, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.076472759246826, 'tokens/total': 100672, 'tokens/trainable': 473, 'epoch': 11.0} | |
| 69%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 11/16 [01:09<00:31, 6.26s/it] 75%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 12/16 [01:15<00:25, 6.27s/it] {'loss': 0.0187, 'grad_norm': 23.36050033569336, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.066161155700684, 'tokens/total': 109824, 'tokens/trainable': 516, 'epoch': 12.0} | |
| 75%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 12/16 [01:15<00:25, 6.27s/it] 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 13/16 [01:21<00:18, 6.27s/it] {'loss': 0.0187, 'grad_norm': 25.07172393798828, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.074800968170166, 'tokens/total': 118976, 'tokens/trainable': 559, 'epoch': 13.0} | |
| 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 13/16 [01:22<00:18, 6.27s/it] 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 14/16 [01:28<00:12, 6.27s/it] {'loss': 0.0172, 'grad_norm': 24.219331741333008, 'learning_rate': 0.001, 'ppl': 1.01735, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.072431564331055, 'tokens/total': 128128, 'tokens/trainable': 602, 'epoch': 14.0} | |
| 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 14/16 [01:28<00:12, 6.27s/it] 94%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 15/16 [01:34<00:06, 6.26s/it] {'loss': 0.0166, 'grad_norm': 23.965293884277344, 'learning_rate': 0.001, 'ppl': 1.01674, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.07435417175293, 'tokens/total': 137280, 'tokens/trainable': 645, 'epoch': 15.0} | |
| 94%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 15/16 [01:34<00:06, 6.26s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 16/16 [01:40<00:00, 6.27s/it] {'loss': 0.0139, 'grad_norm': 21.725933074951172, 'learning_rate': 0.001, 'ppl': 1.014, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0699357986450195, 'tokens/total': 146432, 'tokens/trainable': 688, 'epoch': 16.0} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 16/16 [01:40<00:00, 6.27s/it] {'train_runtime': 100.8727, 'train_samples_per_second': 2.062, 'train_steps_per_second': 0.159, 'train_loss': 0.023414524795953184, 'memory/max_active (GiB)': 9.29, 'memory/max_allocated (GiB)': 9.29, 'memory/device_reserved (GiB)': 23.4, 'epoch': 16.0, 'tokens/train_per_sec_per_gpu': 0.0} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 16/16 [01:40<00:00, 6.27s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 16/16 [01:40<00:00, 6.30s/it] | |
| [2026-01-25 09:37:06,886] [INFO] [axolotl.train.save_trained_model:233] [PID:10047] Training completed! Saving trained model to ./model-out. | |
| [2026-01-25 09:37:19,416] [INFO] [axolotl.train.save_trained_model:351] [PID:10047] Model successfully saved to ./model-out | |