Upload folder using huggingface_hub

9514460 verified 1 day ago

48.2 kB

	[2026-01-25 09:54:39,812] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:13320] baseline 0.000GB ()
	[2026-01-25 09:54:39,813] [INFO] [axolotl.cli.config.load_cfg:259] [PID:13320] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "train.yml",
	"base_model": "google/gemma-3-4b-it",
	"base_model_config": "google/gemma-3-4b-it",
	"batch_size": 13,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 9,
	"datasets": [
	{
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "AlexHung29629/MerlynIfeEldridge2",
	"trust_remote_code": false,
	"type": "input_output"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.9.1"
	},
	"eval_batch_size": 13,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"fp16": false,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": false
	},
	"include_tkps": true,
	"is_multimodal": true,
	"learning_rate": 0.001,
	"liger_fused_linear_cross_entropy": true,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"liger_use_token_scaling": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "constant",
	"max_grad_norm": 1.0,
	"mean_resizing_embeddings": false,
	"micro_batch_size": 13,
	"model_config_type": "gemma3",
	"num_epochs": 32.0,
	"optimizer": "sgd",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./model-out",
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin"
	],
	"pretrain_multipack_attn": true,
	"processor_config": "google/gemma-3-4b-it",
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": false,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_strategy": "no",
	"seed": 42,
	"sequence_len": 758,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": true,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "google/gemma-3-4b-it",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_otel_metrics": false,
	"use_ray": false,
	"use_tensorboard": true,
	"use_wandb": false,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"warmup_ratio": 0.0,
	"weight_decay": 0.0,
	"world_size": 1
	}
	[2026-01-25 09:54:39,935] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:13320] Loaded image size: 896 from model config
	[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos>
	[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos>
	[2026-01-25 09:54:42,061] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad>
	[2026-01-25 09:54:42,062] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk>
	[2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:13320] Unable to find prepared dataset in last_run_prepared/79c123e6ef0babe72cf6db37825069f8
	[2026-01-25 09:54:42,063] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:13320] Loading raw datasets...
	[2026-01-25 09:54:42,063] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:13320] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	[2026-01-25 09:54:42,948] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:13320] Loading dataset: AlexHung29629/MerlynIfeEldridge2 with base_type: input_output and prompt_style: None
	[2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:13320] min_input_len: 152
	[2026-01-25 09:54:43,364] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:13320] max_input_len: 676
	Saving the dataset (0/1 shards): 0%\| \| 0/13 [00:00<?, ? examples/s] Saving the dataset (0/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 63.38 examples/s] Saving the dataset (1/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 63.38 examples/s] Saving the dataset (1/1 shards): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████\| 13/13 [00:00<00:00, 45.17 examples/s]
	[2026-01-25 09:54:43,829] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:13320] total_num_tokens: 4_827
	[2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:13320] `total_supervised_tokens: 43`
	[2026-01-25 09:54:43,831] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:13320] total_num_steps: 32
	[2026-01-25 09:54:43,832] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:13320] Maximum number of steps set at 32
	[2026-01-25 09:54:43,942] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:13320] loading tokenizer... google/gemma-3-4b-it
	[2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:13320] EOS: 1 / <eos>
	[2026-01-25 09:54:45,705] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:13320] BOS: 2 / <bos>
	[2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:13320] PAD: 0 / <pad>
	[2026-01-25 09:54:45,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:13320] UNK: 3 / <unk>
	[2026-01-25 09:54:54,079] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:13320] Loading model
	[2026-01-25 09:54:54,167] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:13320] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-01-25 09:54:54,169] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:13320] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-01-25 09:54:54,266] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:13320] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True}
	Loading checkpoint shards: 0%\| \| 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%\|████████████████████████████████████████████████████████▌ \| 1/2 [00:01<00:01, 1.96s/it] Loading checkpoint shards: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:03<00:00, 1.54s/it] Loading checkpoint shards: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:03<00:00, 1.60s/it]
	[2026-01-25 09:55:10,541] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:13320] Memory usage after model load 0.000GB ()
	[2026-01-25 09:56:09,234] [INFO] [axolotl.train.save_initial_configs:417] [PID:13320] Pre-saving tokenizer to ./model-out...
	[2026-01-25 09:56:09,770] [INFO] [axolotl.train.save_initial_configs:422] [PID:13320] Pre-saving model config to ./model-out...
	[2026-01-25 09:56:09,777] [INFO] [axolotl.train.save_initial_configs:426] [PID:13320] Pre-saving processor to ./model-out...
	[2026-01-25 09:56:13,230] [INFO] [axolotl.train.execute_training:212] [PID:13320] Starting trainer...
	0%\| \| 0/32 [00:00<?, ?it/s] 3%\|████▎ \| 1/32 [00:07<03:38, 7.05s/it] {'loss': 0.0345, 'grad_norm': 61.53063201904297, 'learning_rate': 0.001, 'ppl': 1.0351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 6.18751859664917, 'tokens/total': 9152, 'tokens/trainable': 43, 'epoch': 1.0}
	3%\|████▎ \| 1/32 [00:07<03:38, 7.05s/it] 6%\|████████▋ \| 2/32 [00:13<03:17, 6.57s/it] {'loss': 0.033, 'grad_norm': 57.19621276855469, 'learning_rate': 0.001, 'ppl': 1.03355, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112571716308594, 'tokens/total': 18304, 'tokens/trainable': 86, 'epoch': 2.0}
	6%\|████████▋ \| 2/32 [00:13<03:17, 6.57s/it] 9%\|█████████████ \| 3/32 [00:19<03:06, 6.42s/it] {'loss': 0.0321, 'grad_norm': 57.623077392578125, 'learning_rate': 0.001, 'ppl': 1.03262, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.122233867645264, 'tokens/total': 27456, 'tokens/trainable': 129, 'epoch': 3.0}
	9%\|█████████████ \| 3/32 [00:19<03:06, 6.42s/it] 12%\|█████████████████▍ \| 4/32 [00:25<02:57, 6.35s/it] {'loss': 0.0299, 'grad_norm': 63.824161529541016, 'learning_rate': 0.001, 'ppl': 1.03035, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.1125569343566895, 'tokens/total': 36608, 'tokens/trainable': 172, 'epoch': 4.0}
	12%\|█████████████████▍ \| 4/32 [00:25<02:57, 6.35s/it] 16%\|█████████████████████▋ \| 5/32 [00:32<02:50, 6.31s/it] {'loss': 0.03, 'grad_norm': 61.47892761230469, 'learning_rate': 0.001, 'ppl': 1.03045, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.112518787384033, 'tokens/total': 45760, 'tokens/trainable': 215, 'epoch': 5.0}
	16%\|█████████████████████▋ \| 5/32 [00:32<02:50, 6.31s/it] 19%\|██████████████████████████ \| 6/32 [00:38<02:43, 6.29s/it] {'loss': 0.0242, 'grad_norm': 40.61567687988281, 'learning_rate': 0.001, 'ppl': 1.0245, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.104858875274658, 'tokens/total': 54912, 'tokens/trainable': 258, 'epoch': 6.0}
	19%\|██████████████████████████ \| 6/32 [00:38<02:43, 6.29s/it] 22%\|██████████████████████████████▍ \| 7/32 [00:44<02:36, 6.28s/it] {'loss': 0.0225, 'grad_norm': 31.520526885986328, 'learning_rate': 0.001, 'ppl': 1.02276, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.103605270385742, 'tokens/total': 64064, 'tokens/trainable': 301, 'epoch': 7.0}
	22%\|██████████████████████████████▍ \| 7/32 [00:44<02:36, 6.28s/it] 25%\|██████████████████████████████████▊ \| 8/32 [00:50<02:30, 6.27s/it] {'loss': 0.0217, 'grad_norm': 29.32663917541504, 'learning_rate': 0.001, 'ppl': 1.02194, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.096944332122803, 'tokens/total': 73216, 'tokens/trainable': 344, 'epoch': 8.0}
	25%\|██████████████████████████████████▊ \| 8/32 [00:50<02:30, 6.27s/it] 28%\|███████████████████████████████████████ \| 9/32 [00:57<02:24, 6.27s/it] {'loss': 0.0211, 'grad_norm': 26.701892852783203, 'learning_rate': 0.001, 'ppl': 1.02132, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.088647842407227, 'tokens/total': 82368, 'tokens/trainable': 387, 'epoch': 9.0}
	28%\|███████████████████████████████████████ \| 9/32 [00:57<02:24, 6.27s/it] 31%\|███████████████████████████████████████████▏ \| 10/32 [01:03<02:17, 6.27s/it] {'loss': 0.0205, 'grad_norm': 24.277631759643555, 'learning_rate': 0.001, 'ppl': 1.02071, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084709167480469, 'tokens/total': 91520, 'tokens/trainable': 430, 'epoch': 10.0}
	31%\|███████████████████████████████████████████▏ \| 10/32 [01:03<02:17, 6.27s/it] 34%\|███████████████████████████████████████████████▍ \| 11/32 [01:09<02:11, 6.27s/it] {'loss': 0.02, 'grad_norm': 24.709354400634766, 'learning_rate': 0.001, 'ppl': 1.0202, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.084074020385742, 'tokens/total': 100672, 'tokens/trainable': 473, 'epoch': 11.0}
	34%\|███████████████████████████████████████████████▍ \| 11/32 [01:09<02:11, 6.27s/it] 38%\|███████████████████████████████████████████████████▊ \| 12/32 [01:15<02:05, 6.27s/it] {'loss': 0.0187, 'grad_norm': 23.36050033569336, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.071046829223633, 'tokens/total': 109824, 'tokens/trainable': 516, 'epoch': 12.0}
	38%\|███████████████████████████████████████████████████▊ \| 12/32 [01:15<02:05, 6.27s/it] 41%\|████████████████████████████████████████████████████████ \| 13/32 [01:22<01:59, 6.27s/it] {'loss': 0.0187, 'grad_norm': 25.07172393798828, 'learning_rate': 0.001, 'ppl': 1.01888, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0752339363098145, 'tokens/total': 118976, 'tokens/trainable': 559, 'epoch': 13.0}
	41%\|████████████████████████████████████████████████████████ \| 13/32 [01:22<01:59, 6.27s/it] 44%\|████████████████████████████████████████████████████████████▍ \| 14/32 [01:28<01:52, 6.27s/it] {'loss': 0.0172, 'grad_norm': 24.219331741333008, 'learning_rate': 0.001, 'ppl': 1.01735, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.075183391571045, 'tokens/total': 128128, 'tokens/trainable': 602, 'epoch': 14.0}
	44%\|████████████████████████████████████████████████████████████▍ \| 14/32 [01:28<01:52, 6.27s/it] 47%\|████████████████████████████████████████████████████████████████▋ \| 15/32 [01:34<01:46, 6.28s/it] {'loss': 0.0166, 'grad_norm': 23.965293884277344, 'learning_rate': 0.001, 'ppl': 1.01674, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.073108673095703, 'tokens/total': 137280, 'tokens/trainable': 645, 'epoch': 15.0}
	47%\|████████████████████████████████████████████████████████████████▋ \| 15/32 [01:34<01:46, 6.28s/it] 50%\|█████████████████████████████████████████████████████████████████████ \| 16/32 [01:40<01:40, 6.28s/it] {'loss': 0.0139, 'grad_norm': 21.725933074951172, 'learning_rate': 0.001, 'ppl': 1.014, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.072548866271973, 'tokens/total': 146432, 'tokens/trainable': 688, 'epoch': 16.0}
	50%\|█████████████████████████████████████████████████████████████████████ \| 16/32 [01:41<01:40, 6.28s/it] 53%\|█████████████████████████████████████████████████████████████████████████▎ \| 17/32 [01:47<01:34, 6.28s/it] {'loss': 0.013, 'grad_norm': 19.918394088745117, 'learning_rate': 0.001, 'ppl': 1.01308, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070837497711182, 'tokens/total': 155584, 'tokens/trainable': 731, 'epoch': 17.0}
	53%\|█████████████████████████████████████████████████████████████████████████▎ \| 17/32 [01:47<01:34, 6.28s/it] 56%\|█████████████████████████████████████████████████████████████████████████████▋ \| 18/32 [01:53<01:28, 6.29s/it] {'loss': 0.0111, 'grad_norm': 16.317699432373047, 'learning_rate': 0.001, 'ppl': 1.01116, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.070370674133301, 'tokens/total': 164736, 'tokens/trainable': 774, 'epoch': 18.0}
	56%\|█████████████████████████████████████████████████████████████████████████████▋ \| 18/32 [01:53<01:28, 6.29s/it] 59%\|█████████████████████████████████████████████████████████████████████████████████▉ \| 19/32 [01:59<01:21, 6.29s/it] {'loss': 0.0105, 'grad_norm': 15.480484008789062, 'learning_rate': 0.001, 'ppl': 1.01056, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067584037780762, 'tokens/total': 173888, 'tokens/trainable': 817, 'epoch': 19.0}
	59%\|█████████████████████████████████████████████████████████████████████████████████▉ \| 19/32 [01:59<01:21, 6.29s/it] 62%\|██████████████████████████████████████████████████████████████████████████████████████▎ \| 20/32 [02:06<01:15, 6.29s/it] {'loss': 0.0092, 'grad_norm': 15.762852668762207, 'learning_rate': 0.001, 'ppl': 1.00924, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.068929195404053, 'tokens/total': 183040, 'tokens/trainable': 860, 'epoch': 20.0}
	62%\|██████████████████████████████████████████████████████████████████████████████████████▎ \| 20/32 [02:06<01:15, 6.29s/it] 66%\|██████████████████████████████████████████████████████████████████████████████████████████▌ \| 21/32 [02:12<01:09, 6.28s/it] {'loss': 0.0079, 'grad_norm': 11.2904691696167, 'learning_rate': 0.001, 'ppl': 1.00793, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069418907165527, 'tokens/total': 192192, 'tokens/trainable': 903, 'epoch': 21.0}
	66%\|██████████████████████████████████████████████████████████████████████████████████████████▌ \| 21/32 [02:12<01:09, 6.28s/it] 69%\|██████████████████████████████████████████████████████████████████████████████████████████████▉ \| 22/32 [02:18<01:02, 6.28s/it] {'loss': 0.0074, 'grad_norm': 10.677675247192383, 'learning_rate': 0.001, 'ppl': 1.00743, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0709052085876465, 'tokens/total': 201344, 'tokens/trainable': 946, 'epoch': 22.0}
	69%\|██████████████████████████████████████████████████████████████████████████████████████████████▉ \| 22/32 [02:18<01:02, 6.28s/it] 72%\|███████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 23/32 [02:24<00:56, 6.29s/it] {'loss': 0.0063, 'grad_norm': 8.554458618164062, 'learning_rate': 0.001, 'ppl': 1.00632, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.069887638092041, 'tokens/total': 210496, 'tokens/trainable': 989, 'epoch': 23.0}
	72%\|███████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 23/32 [02:25<00:56, 6.29s/it] 75%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 24/32 [02:31<00:50, 6.29s/it] {'loss': 0.0058, 'grad_norm': 7.792212009429932, 'learning_rate': 0.001, 'ppl': 1.00582, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.052520275115967, 'tokens/total': 219648, 'tokens/trainable': 1032, 'epoch': 24.0}
	75%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 24/32 [02:31<00:50, 6.29s/it] 78%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 25/32 [02:37<00:44, 6.29s/it] {'loss': 0.0047, 'grad_norm': 5.932632923126221, 'learning_rate': 0.001, 'ppl': 1.00471, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.059691429138184, 'tokens/total': 228800, 'tokens/trainable': 1075, 'epoch': 25.0}
	78%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 25/32 [02:37<00:44, 6.29s/it] 81%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 26/32 [02:43<00:37, 6.29s/it] {'loss': 0.0046, 'grad_norm': 5.608907699584961, 'learning_rate': 0.001, 'ppl': 1.00461, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0648603439331055, 'tokens/total': 237952, 'tokens/trainable': 1118, 'epoch': 26.0}
	81%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 26/32 [02:43<00:37, 6.29s/it] 84%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 27/32 [02:50<00:31, 6.29s/it] {'loss': 0.0043, 'grad_norm': 5.099766254425049, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.065439701080322, 'tokens/total': 247104, 'tokens/trainable': 1161, 'epoch': 27.0}
	84%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 27/32 [02:50<00:31, 6.29s/it] 88%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 28/32 [02:56<00:25, 6.29s/it] {'loss': 0.0043, 'grad_norm': 4.663393020629883, 'learning_rate': 0.001, 'ppl': 1.00431, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.063167095184326, 'tokens/total': 256256, 'tokens/trainable': 1204, 'epoch': 28.0}
	88%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 28/32 [02:56<00:25, 6.29s/it] 91%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 29/32 [03:02<00:18, 6.29s/it] {'loss': 0.0033, 'grad_norm': 3.509425163269043, 'learning_rate': 0.001, 'ppl': 1.00331, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.064444065093994, 'tokens/total': 265408, 'tokens/trainable': 1247, 'epoch': 29.0}
	91%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 29/32 [03:02<00:18, 6.29s/it] 94%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 30/32 [03:09<00:12, 6.29s/it] {'loss': 0.0034, 'grad_norm': 3.3978261947631836, 'learning_rate': 0.001, 'ppl': 1.00341, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.067255973815918, 'tokens/total': 274560, 'tokens/trainable': 1290, 'epoch': 30.0}
	94%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 30/32 [03:09<00:12, 6.29s/it] 97%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 31/32 [03:15<00:06, 6.29s/it] {'loss': 0.0035, 'grad_norm': 3.4551568031311035, 'learning_rate': 0.001, 'ppl': 1.00351, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.062354564666748, 'tokens/total': 283712, 'tokens/trainable': 1333, 'epoch': 31.0}
	97%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 31/32 [03:15<00:06, 6.29s/it] 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 32/32 [03:21<00:00, 6.29s/it] {'loss': 0.004, 'grad_norm': 4.433701515197754, 'learning_rate': 0.001, 'ppl': 1.00401, 'memory/max_active (GiB)': 20.46, 'memory/max_allocated (GiB)': 20.46, 'memory/device_reserved (GiB)': 23.4, 'tokens/train_per_sec_per_gpu': 7.0588459968566895, 'tokens/total': 292864, 'tokens/trainable': 1376, 'epoch': 32.0}
	100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 32/32 [03:21<00:00, 6.29s/it] {'train_runtime': 201.6905, 'train_samples_per_second': 2.063, 'train_steps_per_second': 0.159, 'train_loss': 0.01493466420652112, 'memory/max_active (GiB)': 9.29, 'memory/max_allocated (GiB)': 9.29, 'memory/device_reserved (GiB)': 23.4, 'epoch': 32.0, 'tokens/train_per_sec_per_gpu': 0.0}
	100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 32/32 [03:21<00:00, 6.29s/it] 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 32/32 [03:21<00:00, 6.30s/it]
	[2026-01-25 09:59:35,422] [INFO] [axolotl.train.save_trained_model:233] [PID:13320] Training completed! Saving trained model to ./model-out.
	[2026-01-25 09:59:48,526] [INFO] [axolotl.train.save_trained_model:351] [PID:13320] Model successfully saved to ./model-out