Upload folder using huggingface_hub

5270841 verified 18 days ago

112 kB

	[2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB ()
	[2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "qwen3-sft-stmt-tk.yml",
	"base_model": "Qwen/Qwen3-8B",
	"base_model_config": "Qwen/Qwen3-8B",
	"batch_size": 16,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_90",
	"fp8": true,
	"n_gpu": 8,
	"n_node": 1
	},
	"chat_template": "qwen3",
	"chat_template_kwargs": {
	"enable_thinking": false
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 8,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 192,
	"datasets": [
	{
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "xiaolesu/lean4-sft-stmt-tk",
	"split": "train",
	"trust_remote_code": false,
	"type": "alpaca"
	}
	],
	"ddp": true,
	"device": "cuda:0",
	"device_map": {
	"": 0
	},
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"eaft_alpha": 1.0,
	"eaft_k": 20,
	"env_capabilities": {
	"torch_version": "2.9.1"
	},
	"eval_batch_size": 2,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_sample_packing": true,
	"eval_table_size": 0,
	"evals_per_epoch": 10,
	"experimental_skip_move_to_device": true,
	"flex_attention": true,
	"flex_attn_compile_kwargs": {
	"dynamic": false,
	"mode": "max-autotune-no-cudagraphs"
	},
	"fp16": false,
	"fsdp": [
	"full_shard",
	"auto_wrap"
	],
	"fsdp_config": {
	"activation_checkpointing": true,
	"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
	"cpu_ram_efficient_loading": true,
	"fsdp_version": 2,
	"offload_params": false,
	"reshard_after_forward": true,
	"state_dict_type": "FULL_STATE_DICT",
	"transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
	},
	"fsdp_version": 2,
	"generate_samples": false,
	"generation_do_sample": true,
	"generation_max_new_tokens": 50,
	"generation_prompt_ratio": 0.5,
	"generation_temperature": 0.7,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": false,
	"include_tkps": true,
	"learning_rate": 1e-05,
	"liger_fused_linear_cross_entropy": true,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 5,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"mean_resizing_embeddings": false,
	"micro_batch_size": 2,
	"model_config_type": "qwen3",
	"num_epochs": 2.0,
	"num_generation_samples": 3,
	"optimizer": "adamw_torch_fused",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./outputs/qwen3-sft-stmt-tk/",
	"pad_to_sequence_len": true,
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin"
	],
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"quantize_moe_experts": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": true,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_steps": 0.05,
	"save_total_limit": 3,
	"saves_per_epoch": 10,
	"sequence_len": 8192,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": true,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "Qwen/Qwen3-8B",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_otel_metrics": false,
	"use_ray": false,
	"use_wandb": true,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"wandb_name": "qwen3-8b-tk-run1",
	"wandb_project": "qwen3-sft-stmt-tk",
	"warmup_ratio": 0.1,
	"weight_decay": 0.0,
	"world_size": 8
	}
	[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <\|im_end\|>
	[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
	[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <\|endoftext\|>
	[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
	[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11
	[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets...
	[2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s]
	[2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None
	[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205
	[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159
	Dropping Invalid Sequences (<None or >8192) (num_proc=192): 0%\| \| 0/11192 [00:00<?, ? examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 1%\| \| 59/11192 [00:02<06:34, 28.25 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 3%\|▎ \| 295/11192 [00:02<01:02, 175.65 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 6%\|▌ \| 649/11192 [00:02<00:23, 453.06 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 8%\|▊ \| 885/11192 [00:02<00:16, 634.46 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 10%\|█ \| 1121/11192 [00:02<00:11, 849.04 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 13%\|█▎ \| 1416/11192 [00:02<00:08, 1166.00 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 15%\|█▌ \| 1711/11192 [00:02<00:06, 1480.17 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 18%\|█▊ \| 2006/11192 [00:02<00:05, 1697.58 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 21%\|██ \| 2301/11192 [00:02<00:04, 1949.74 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 23%\|██▎ \| 2596/11192 [00:03<00:04, 2145.10 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 26%\|██▌ \| 2891/11192 [00:03<00:03, 2324.57 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 29%\|██▉ \| 3245/11192 [00:03<00:03, 2566.75 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 70%\|██████▉ \| 7828/11192 [00:03<00:00, 14035.00 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 100%\|██████████\| 11192/11192 [00:04<00:00, 2753.84 examples/s]
	[2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192])
	Drop Samples with Zero Trainable Tokens (num_proc=192): 0%\| \| 0/10830 [00:00<?, ? examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 1%\| \| 57/10830 [00:02<06:27, 27.78 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 3%\|▎ \| 285/10830 [00:02<01:00, 173.64 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 4%\|▍ \| 456/10830 [00:02<00:34, 299.77 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 6%\|▋ \| 684/10830 [00:02<00:20, 506.62 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 8%\|▊ \| 912/10830 [00:02<00:13, 736.95 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 11%\|█ \| 1140/10830 [00:02<00:10, 947.17 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 13%\|█▎ \| 1368/10830 [00:02<00:08, 1094.03 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 15%\|█▍ \| 1596/10830 [00:02<00:07, 1269.49 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 17%\|█▋ \| 1824/10830 [00:02<00:06, 1437.65 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 19%\|█▉ \| 2052/10830 [00:03<00:05, 1614.63 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 21%\|██ \| 2280/10830 [00:03<00:05, 1635.72 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 23%\|██▎ \| 2508/10830 [00:03<00:04, 1732.21 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 25%\|██▌ \| 2736/10830 [00:03<00:04, 1721.60 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 27%\|██▋ \| 2964/10830 [00:03<00:04, 1703.27 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 29%\|██▉ \| 3192/10830 [00:03<00:04, 1798.77 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 32%\|███▏ \| 3477/10830 [00:03<00:03, 1958.86 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 34%\|███▍ \| 3705/10830 [00:03<00:03, 2037.08 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 36%\|███▋ \| 3933/10830 [00:04<00:03, 2067.96 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 38%\|███▊ \| 4161/10830 [00:04<00:03, 2091.19 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 41%\|████ \| 4389/10830 [00:04<00:05, 1127.36 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 43%\|████▎ \| 4670/10830 [00:04<00:04, 1385.39 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 45%\|████▌ \| 4894/10830 [00:04<00:04, 1432.10 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 69%\|██████▉ \| 7526/10830 [00:04<00:00, 6499.14 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 100%\|██████████\| 10830/10830 [00:05<00:00, 1931.57 examples/s]
	Add position_id column (Sample Packing) (num_proc=192): 0%\| \| 0/10830 [00:00<?, ? examples/s] Add position_id column (Sample Packing) (num_proc=192): 1%\| \| 57/10830 [00:02<06:33, 27.40 examples/s] Add position_id column (Sample Packing) (num_proc=192): 2%\|▏ \| 228/10830 [00:02<01:18, 135.14 examples/s] Add position_id column (Sample Packing) (num_proc=192): 4%\|▍ \| 456/10830 [00:02<00:33, 310.31 examples/s] Add position_id column (Sample Packing) (num_proc=192): 8%\|▊ \| 912/10830 [00:02<00:14, 692.10 examples/s] Add position_id column (Sample Packing) (num_proc=192): 11%\|█ \| 1140/10830 [00:02<00:11, 858.26 examples/s] Add position_id column (Sample Packing) (num_proc=192): 13%\|█▎ \| 1368/10830 [00:02<00:09, 1027.56 examples/s] Add position_id column (Sample Packing) (num_proc=192): 15%\|█▍ \| 1596/10830 [00:02<00:07, 1182.55 examples/s] Add position_id column (Sample Packing) (num_proc=192): 17%\|█▋ \| 1881/10830 [00:02<00:06, 1425.26 examples/s] Add position_id column (Sample Packing) (num_proc=192): 20%\|██ \| 2166/10830 [00:03<00:05, 1604.97 examples/s] Add position_id column (Sample Packing) (num_proc=192): 22%\|██▏ \| 2394/10830 [00:03<00:04, 1738.29 examples/s] Add position_id column (Sample Packing) (num_proc=192): 25%\|██▍ \| 2679/10830 [00:03<00:04, 1951.23 examples/s] Add position_id column (Sample Packing) (num_proc=192): 63%\|██████▎ \| 6854/10830 [00:03<00:00, 11681.66 examples/s] Add position_id column (Sample Packing) (num_proc=192): 100%\|██████████\| 10830/10830 [00:04<00:00, 2621.72 examples/s]
	Saving the dataset (0/42 shards): 0%\| \| 0/10830 [00:00<?, ? examples/s] Saving the dataset (0/42 shards): 2%\|▏ \| 258/10830 [00:00<00:22, 464.02 examples/s] Saving the dataset (1/42 shards): 2%\|▏ \| 258/10830 [00:00<00:22, 464.02 examples/s] Saving the dataset (2/42 shards): 7%\|▋ \| 774/10830 [00:00<00:21, 464.02 examples/s] Saving the dataset (3/42 shards): 7%\|▋ \| 774/10830 [00:00<00:21, 464.02 examples/s] Saving the dataset (4/42 shards): 14%\|█▍ \| 1548/10830 [00:00<00:20, 464.02 examples/s] Saving the dataset (5/42 shards): 14%\|█▍ \| 1548/10830 [00:00<00:20, 464.02 examples/s] Saving the dataset (6/42 shards): 17%\|█▋ \| 1806/10830 [00:00<00:19, 464.02 examples/s] Saving the dataset (7/42 shards): 19%\|█▉ \| 2064/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (8/42 shards): 21%\|██▏ \| 2322/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (9/42 shards): 21%\|██▏ \| 2322/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (10/42 shards): 26%\|██▌ \| 2838/10830 [00:00<00:17, 464.02 examples/s] Saving the dataset (11/42 shards): 29%\|██▊ \| 3096/10830 [00:00<00:16, 464.02 examples/s] Saving the dataset (12/42 shards): 31%\|███ \| 3354/10830 [00:00<00:16, 464.02 examples/s] Saving the dataset (13/42 shards): 33%\|███▎ \| 3612/10830 [00:00<00:15, 464.02 examples/s] Saving the dataset (14/42 shards): 33%\|███▎ \| 3612/10830 [00:00<00:15, 464.02 examples/s] Saving the dataset (15/42 shards): 38%\|███▊ \| 4128/10830 [00:00<00:14, 464.02 examples/s] Saving the dataset (16/42 shards): 40%\|████ \| 4386/10830 [00:00<00:13, 464.02 examples/s] Saving the dataset (17/42 shards): 40%\|████ \| 4386/10830 [00:00<00:13, 464.02 examples/s] Saving the dataset (18/42 shards): 45%\|████▌ \| 4902/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (19/42 shards): 48%\|████▊ \| 5160/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (20/42 shards): 48%\|████▊ \| 5160/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (21/42 shards): 52%\|█████▏ \| 5676/10830 [00:00<00:11, 464.02 examples/s] Saving the dataset (22/42 shards): 52%\|█████▏ \| 5676/10830 [00:00<00:11, 464.02 examples/s] Saving the dataset (23/42 shards): 55%\|█████▍ \| 5934/10830 [00:00<00:10, 464.02 examples/s] Saving the dataset (24/42 shards): 57%\|█████▋ \| 6192/10830 [00:00<00:09, 464.02 examples/s] Saving the dataset (25/42 shards): 64%\|██████▍ \| 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (26/42 shards): 64%\|██████▍ \| 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (27/42 shards): 64%\|██████▍ \| 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (28/42 shards): 67%\|██████▋ \| 7224/10830 [00:00<00:07, 464.02 examples/s] Saving the dataset (29/42 shards): 74%\|███████▍ \| 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (30/42 shards): 74%\|███████▍ \| 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (31/42 shards): 74%\|███████▍ \| 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (32/42 shards): 79%\|███████▊ \| 8514/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (33/42 shards): 81%\|████████ \| 8772/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (34/42 shards): 81%\|████████ \| 8772/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (35/42 shards): 83%\|████████▎ \| 9030/10830 [00:00<00:03, 464.02 examples/s] Saving the dataset (36/42 shards): 88%\|████████▊ \| 9545/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (37/42 shards): 88%\|████████▊ \| 9545/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (38/42 shards): 91%\|█████████ \| 9802/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (39/42 shards): 95%\|█████████▌\| 10316/10830 [00:00<00:01, 464.02 examples/s] Saving the dataset (40/42 shards): 95%\|█████████▌\| 10316/10830 [00:00<00:01, 464.02 examples/s] Saving the dataset (41/42 shards): 98%\|█████████▊\| 10573/10830 [00:00<00:00, 464.02 examples/s] Saving the dataset (42/42 shards): 100%\|██████████\| 10830/10830 [00:00<00:00, 464.02 examples/s] Saving the dataset (42/42 shards): 100%\|██████████\| 10830/10830 [00:00<00:00, 16314.56 examples/s]
	[2026-03-16 19:08:54,045] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:213] total_num_tokens: 33_957_071
	[2026-03-16 19:08:54,340] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:213] `total_supervised_tokens: 32_028_150`
	[2026-03-16 19:08:55,893] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.7050187587738037
	[2026-03-16 19:11:05,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2148, 2146, 2148, 2145, 2146, 2146, 2148, 2145]
	[2026-03-16 19:11:06,172] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:213] data_loader_len: 268
	[2026-03-16 19:11:06,189] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:213] sample_packing_eff_est across ranks: [0.9646614789962769, 0.9657852649688721, 0.9646614789962769, 0.9657852649688721, 0.9648860096931458, 0.9648860096931458, 0.9653354287147522, 0.9657852649688721]
	[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:213] sample_packing_eff_est: 0.97
	[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:213] total_num_steps: 536
	[2026-03-16 19:11:06,192] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:213] Maximum number of steps set at 536
	[2026-03-16 19:11:06,242] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:213] loading tokenizer... Qwen/Qwen3-8B
	[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <\|im_end\|>
	[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
	[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <\|endoftext\|>
	[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
	[2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model
	[2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing...
	[2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
	Loading weights: 0%\| \| 0/399 [00:00<?, ?it/s] Loading weights: 100%\|██████████\| 399/399 [00:00<00:00, 9671.84it/s]
	[2026-03-16 19:11:09,882] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:359] [PID:213] Converting modules to torch.bfloat16
	[2026-03-16 19:11:09,885] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:213] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
	[2026-03-16 19:11:11,696] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:213] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.Multiple deprecation warnings due to FSDP2 conversion:
	sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
	[2026-03-16 19:11:12,192] [INFO] [axolotl.train.save_initial_configs:417] [PID:213] Pre-saving tokenizer to ./outputs/qwen3-sft-stmt-tk/...
	[2026-03-16 19:11:12,283] [INFO] [axolotl.train.save_initial_configs:422] [PID:213] Pre-saving model config to ./outputs/qwen3-sft-stmt-tk/...
	[2026-03-16 19:11:12,286] [INFO] [axolotl.train.execute_training:218] [PID:213] Starting trainer...
	[2026-03-16 19:11:14,793] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.9547648429870605
	[2026-03-16 19:11:14,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2103, 2104, 2104, 2104, 2103, 2104, 2106, 2104]
	[2026-03-16 19:11:15,013] [INFO] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:34] [PID:213] Broadcasting full state dict to all ranks...
	[2026-03-16 19:11:22,269] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:86] [PID:213] Time taken to load full state dict: 7.26 seconds
	[2026-03-16 19:11:22,270] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.log_gpu_memory_usage:127] [PID:213] Memory usage after broadcasting full state dict 3.067GB (+3.067GB allocated, +3.178GB reserved)
	wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
	wandb: Currently logged in as: suxiaole0223 (suxiaole) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
	wandb: setting up run kje10pck
	wandb: Tracking run with wandb version 0.25.1
	wandb: Run data is saved locally in /workspace/axolotl-workspace/wandb/run-20260316_191122-kje10pck
	wandb: Run `wandb offline` to turn off syncing.
	wandb: Syncing run qwen3-8b-tk-run1
	wandb: ⭐️ View project at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk
	wandb: 🚀 View run at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk/runs/kje10pck
	wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
	wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
	[2026-03-16 19:11:25,554] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:213] The Axolotl config has been saved to the WandB run under files.
	0%\| \| 0/536 [00:00<?, ?it/s][2026-03-16 19:11:57,210] [WARNING] [py.warnings._showwarnmsg:110] [PID:213] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
	_warn_once(

	0%\| \| 1/536 [00:40<6:03:21, 40.75s/it] 0%\| \| 2/536 [00:43<2:42:00, 18.20s/it] 1%\| \| 3/536 [00:45<1:37:15, 10.95s/it] 1%\| \| 4/536 [00:47<1:07:23, 7.60s/it] 1%\| \| 5/536 [00:50<50:28, 5.70s/it] {'loss': '0.8667', 'grad_norm': '2.609', 'learning_rate': '7.547e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6531', 'tokens/total': 655360, 'tokens/trainable': 611049, 'epoch': '0.01908'}
	1%\| \| 5/536 [00:50<50:28, 5.70s/it] 1%\| \| 6/536 [00:52<40:15, 4.56s/it] 1%\|▏ \| 7/536 [00:55<34:02, 3.86s/it] 1%\|▏ \| 8/536 [00:57<30:00, 3.41s/it] 2%\|▏ \| 9/536 [00:59<26:45, 3.05s/it] 2%\|▏ \| 10/536 [01:02<24:45, 2.82s/it] {'loss': '0.8307', 'grad_norm': '2.5', 'learning_rate': '1.698e-06', 'ppl': '2.295', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6647', 'tokens/total': 1310720, 'tokens/trainable': 1224548, 'epoch': '0.03817'}
	2%\|▏ \| 10/536 [01:02<24:45, 2.82s/it] 2%\|▏ \| 11/536 [01:04<23:13, 2.65s/it] 2%\|▏ \| 12/536 [01:06<22:04, 2.53s/it] 2%\|▏ \| 13/536 [01:08<21:32, 2.47s/it] 3%\|▎ \| 14/536 [01:11<21:27, 2.47s/it] 3%\|▎ \| 15/536 [01:13<21:28, 2.47s/it] {'loss': '0.8487', 'grad_norm': '2.453', 'learning_rate': '2.642e-06', 'ppl': '2.337', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6160', 'tokens/total': 1966080, 'tokens/trainable': 1834432, 'epoch': '0.05725'}
	3%\|▎ \| 15/536 [01:13<21:28, 2.47s/it] 3%\|▎ \| 16/536 [01:16<21:18, 2.46s/it] 3%\|▎ \| 17/536 [01:18<20:51, 2.41s/it] 3%\|▎ \| 18/536 [01:20<20:44, 2.40s/it] 4%\|▎ \| 19/536 [01:23<21:59, 2.55s/it] 4%\|▎ \| 20/536 [01:26<21:40, 2.52s/it] {'loss': '0.7713', 'grad_norm': '1.898', 'learning_rate': '3.585e-06', 'ppl': '2.163', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6256', 'tokens/total': 2621440, 'tokens/trainable': 2448388, 'epoch': '0.07634'}
	4%\|▎ \| 20/536 [01:26<21:40, 2.52s/it] 4%\|▍ \| 21/536 [01:28<21:23, 2.49s/it] 4%\|▍ \| 22/536 [01:31<20:49, 2.43s/it] 4%\|▍ \| 23/536 [01:33<20:37, 2.41s/it] 4%\|▍ \| 24/536 [01:35<20:37, 2.42s/it] 5%\|▍ \| 25/536 [01:38<20:01, 2.35s/it] {'loss': '0.7452', 'grad_norm': '1.273', 'learning_rate': '4.528e-06', 'ppl': '2.107', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6954', 'tokens/total': 3276800, 'tokens/trainable': 3060985, 'epoch': '0.09542'}
	5%\|▍ \| 25/536 [01:38<20:01, 2.35s/it] 5%\|▍ \| 26/536 [01:40<19:41, 2.32s/it] 5%\|▌ \| 27/536 [01:42<19:26, 2.29s/it][2026-03-16 19:13:17,483] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-27

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.48s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.48s/it]
	5%\|▌ \| 28/536 [03:16<4:11:47, 29.74s/it] 5%\|▌ \| 29/536 [03:18<3:01:39, 21.50s/it] 6%\|▌ \| 30/536 [03:20<2:12:38, 15.73s/it] {'loss': '0.718', 'grad_norm': '0.7695', 'learning_rate': '5.472e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6706', 'tokens/total': 3932160, 'tokens/trainable': 3670695, 'epoch': '0.1145'}
	6%\|▌ \| 30/536 [03:20<2:12:38, 15.73s/it] 6%\|▌ \| 31/536 [03:23<1:38:27, 11.70s/it] 6%\|▌ \| 32/536 [03:25<1:14:38, 8.89s/it] 6%\|▌ \| 33/536 [03:27<57:48, 6.90s/it] 6%\|▋ \| 34/536 [03:29<46:07, 5.51s/it] 7%\|▋ \| 35/536 [03:32<37:56, 4.54s/it] {'loss': '0.6699', 'grad_norm': '0.6406', 'learning_rate': '6.415e-06', 'ppl': '1.954', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6770', 'tokens/total': 4587520, 'tokens/trainable': 4284736, 'epoch': '0.1336'}
	7%\|▋ \| 35/536 [03:32<37:56, 4.54s/it] 7%\|▋ \| 36/536 [03:34<32:19, 3.88s/it] 7%\|▋ \| 37/536 [03:37<28:45, 3.46s/it] 7%\|▋ \| 38/536 [03:39<26:05, 3.14s/it] 7%\|▋ \| 39/536 [03:41<24:10, 2.92s/it] 7%\|▋ \| 40/536 [03:44<22:31, 2.72s/it] {'loss': '0.6393', 'grad_norm': '0.418', 'learning_rate': '7.358e-06', 'ppl': '1.895', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6668', 'tokens/total': 5242880, 'tokens/trainable': 4896504, 'epoch': '0.1527'}
	7%\|▋ \| 40/536 [03:44<22:31, 2.72s/it] 8%\|▊ \| 41/536 [03:46<21:24, 2.59s/it] 8%\|▊ \| 42/536 [03:48<20:36, 2.50s/it] 8%\|▊ \| 43/536 [03:51<20:06, 2.45s/it] 8%\|▊ \| 44/536 [03:53<19:38, 2.39s/it] 8%\|▊ \| 45/536 [03:55<19:17, 2.36s/it] {'loss': '0.5953', 'grad_norm': '0.3594', 'learning_rate': '8.302e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 5898240, 'tokens/trainable': 5505933, 'epoch': '0.1718'}
	8%\|▊ \| 45/536 [03:55<19:17, 2.36s/it] 9%\|▊ \| 46/536 [03:57<19:17, 2.36s/it] 9%\|▉ \| 47/536 [04:00<19:01, 2.33s/it] 9%\|▉ \| 48/536 [04:02<19:02, 2.34s/it] 9%\|▉ \| 49/536 [04:04<19:02, 2.35s/it] 9%\|▉ \| 50/536 [04:07<18:55, 2.34s/it] {'loss': '0.5779', 'grad_norm': '0.332', 'learning_rate': '9.245e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6574', 'tokens/total': 6553600, 'tokens/trainable': 6116643, 'epoch': '0.1908'}
	9%\|▉ \| 50/536 [04:07<18:55, 2.34s/it] 10%\|▉ \| 51/536 [04:09<18:46, 2.32s/it] 10%\|▉ \| 52/536 [04:11<18:33, 2.30s/it] 10%\|▉ \| 53/536 [04:14<18:19, 2.28s/it] 10%\|█ \| 54/536 [04:16<18:23, 2.29s/it][2026-03-16 19:15:50,860] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-54

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.65s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.65s/it]
	10%\|█ \| 55/536 [05:48<3:55:25, 29.37s/it] {'loss': '0.5579', 'grad_norm': '0.2793', 'learning_rate': '1e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4847', 'tokens/total': 7208960, 'tokens/trainable': 6728061, 'epoch': '0.2099'}
	10%\|█ \| 55/536 [05:48<3:55:25, 29.37s/it] 10%\|█ \| 56/536 [05:51<2:50:56, 21.37s/it] 11%\|█ \| 57/536 [05:54<2:05:27, 15.72s/it] 11%\|█ \| 58/536 [05:56<1:33:09, 11.69s/it] 11%\|█ \| 59/536 [05:58<1:11:13, 8.96s/it] 11%\|█ \| 60/536 [06:01<55:01, 6.94s/it] {'loss': '0.5485', 'grad_norm': '0.2773', 'learning_rate': '9.996e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 7864320, 'tokens/trainable': 7336524, 'epoch': '0.229'}
	11%\|█ \| 60/536 [06:01<55:01, 6.94s/it] 11%\|█▏ \| 61/536 [06:03<43:45, 5.53s/it] 12%\|█▏ \| 62/536 [06:05<36:05, 4.57s/it] 12%\|█▏ \| 63/536 [06:08<30:31, 3.87s/it] 12%\|█▏ \| 64/536 [06:10<26:38, 3.39s/it] 12%\|█▏ \| 65/536 [06:12<24:01, 3.06s/it] {'loss': '0.5385', 'grad_norm': '0.2734', 'learning_rate': '9.987e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6565', 'tokens/total': 8519680, 'tokens/trainable': 7944984, 'epoch': '0.2481'}
	12%\|█▏ \| 65/536 [06:12<24:01, 3.06s/it] 12%\|█▏ \| 66/536 [06:14<22:09, 2.83s/it] 12%\|█▎ \| 67/536 [06:17<21:06, 2.70s/it] 13%\|█▎ \| 68/536 [06:19<20:07, 2.58s/it] 13%\|█▎ \| 69/536 [06:21<19:21, 2.49s/it] 13%\|█▎ \| 70/536 [06:24<19:00, 2.45s/it] {'loss': '0.5197', 'grad_norm': '0.2578', 'learning_rate': '9.973e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 9175040, 'tokens/trainable': 8556200, 'epoch': '0.2672'}
	13%\|█▎ \| 70/536 [06:24<19:00, 2.45s/it] 13%\|█▎ \| 71/536 [06:26<18:33, 2.39s/it] 13%\|█▎ \| 72/536 [06:28<18:14, 2.36s/it] 14%\|█▎ \| 73/536 [06:31<18:01, 2.34s/it] 14%\|█▍ \| 74/536 [06:33<18:02, 2.34s/it] 14%\|█▍ \| 75/536 [06:35<17:54, 2.33s/it] {'loss': '0.5316', 'grad_norm': '0.3008', 'learning_rate': '9.953e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6728', 'tokens/total': 9830400, 'tokens/trainable': 9167282, 'epoch': '0.2863'}
	14%\|█▍ \| 75/536 [06:35<17:54, 2.33s/it] 14%\|█▍ \| 76/536 [06:38<17:54, 2.34s/it] 14%\|█▍ \| 77/536 [06:40<18:05, 2.37s/it] 15%\|█▍ \| 78/536 [06:43<18:28, 2.42s/it] 15%\|█▍ \| 79/536 [06:45<18:05, 2.37s/it] 15%\|█▍ \| 80/536 [06:47<17:47, 2.34s/it] {'loss': '0.5154', 'grad_norm': '0.3164', 'learning_rate': '9.929e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6730', 'tokens/total': 10485760, 'tokens/trainable': 9774908, 'epoch': '0.3053'}
	15%\|█▍ \| 80/536 [06:47<17:47, 2.34s/it] 15%\|█▌ \| 81/536 [06:49<17:39, 2.33s/it][2026-03-16 19:18:24,375] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-81

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.37s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.37s/it]
	15%\|█▌ \| 82/536 [08:22<3:43:29, 29.54s/it] 15%\|█▌ \| 83/536 [08:25<2:41:12, 21.35s/it] 16%\|█▌ \| 84/536 [08:27<1:57:43, 15.63s/it] 16%\|█▌ \| 85/536 [08:29<1:27:29, 11.64s/it] {'loss': '0.5143', 'grad_norm': '0.2363', 'learning_rate': '9.899e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6604', 'tokens/total': 11141120, 'tokens/trainable': 10388109, 'epoch': '0.3244'}
	16%\|█▌ \| 85/536 [08:29<1:27:29, 11.64s/it] 16%\|█▌ \| 86/536 [08:32<1:06:16, 8.84s/it] 16%\|█▌ \| 87/536 [08:34<51:19, 6.86s/it] 16%\|█▋ \| 88/536 [08:36<40:55, 5.48s/it] 17%\|█▋ \| 89/536 [08:38<33:37, 4.51s/it] 17%\|█▋ \| 90/536 [08:41<28:43, 3.86s/it] {'loss': '0.4957', 'grad_norm': '0.2412', 'learning_rate': '9.864e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 11796480, 'tokens/trainable': 10999678, 'epoch': '0.3435'}
	17%\|█▋ \| 90/536 [08:41<28:43, 3.86s/it] 17%\|█▋ \| 91/536 [08:43<25:17, 3.41s/it] 17%\|█▋ \| 92/536 [08:45<22:49, 3.08s/it] 17%\|█▋ \| 93/536 [08:48<21:12, 2.87s/it] 18%\|█▊ \| 94/536 [08:50<19:44, 2.68s/it] 18%\|█▊ \| 95/536 [08:52<19:22, 2.64s/it] {'loss': '0.509', 'grad_norm': '0.2236', 'learning_rate': '9.823e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5974', 'tokens/total': 12451840, 'tokens/trainable': 11609345, 'epoch': '0.3626'}
	18%\|█▊ \| 95/536 [08:52<19:22, 2.64s/it] 18%\|█▊ \| 96/536 [08:55<18:35, 2.54s/it] 18%\|█▊ \| 97/536 [08:57<18:01, 2.46s/it] 18%\|█▊ \| 98/536 [09:00<19:09, 2.62s/it] 18%\|█▊ \| 99/536 [09:03<19:00, 2.61s/it] 19%\|█▊ \| 100/536 [09:05<18:16, 2.51s/it] {'loss': '0.4925', 'grad_norm': '0.2451', 'learning_rate': '9.778e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6696', 'tokens/total': 13107200, 'tokens/trainable': 12218448, 'epoch': '0.3817'}
	19%\|█▊ \| 100/536 [09:05<18:16, 2.51s/it] 19%\|█▉ \| 101/536 [09:07<17:50, 2.46s/it] 19%\|█▉ \| 102/536 [09:09<17:19, 2.40s/it] 19%\|█▉ \| 103/536 [09:12<16:59, 2.35s/it] 19%\|█▉ \| 104/536 [09:14<16:47, 2.33s/it] 20%\|█▉ \| 105/536 [09:16<16:34, 2.31s/it] {'loss': '0.5051', 'grad_norm': '0.25', 'learning_rate': '9.727e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6724', 'tokens/total': 13762560, 'tokens/trainable': 12826468, 'epoch': '0.4008'}
	20%\|█▉ \| 105/536 [09:16<16:34, 2.31s/it] 20%\|█▉ \| 106/536 [09:19<16:28, 2.30s/it] 20%\|█▉ \| 107/536 [09:21<16:26, 2.30s/it] 20%\|██ \| 108/536 [09:23<16:27, 2.31s/it][2026-03-16 19:20:58,221] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-108

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.13s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.13s/it]
	20%\|██ \| 109/536 [11:03<3:44:22, 31.53s/it] 21%\|██ \| 110/536 [11:05<2:41:43, 22.78s/it] {'loss': '0.4725', 'grad_norm': '0.2266', 'learning_rate': '9.672e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6506', 'tokens/total': 14417920, 'tokens/trainable': 13440042, 'epoch': '0.4198'}
	21%\|██ \| 110/536 [11:05<2:41:43, 22.78s/it] 21%\|██ \| 111/536 [11:07<1:57:41, 16.61s/it] 21%\|██ \| 112/536 [11:10<1:26:54, 12.30s/it] 21%\|██ \| 113/536 [11:12<1:05:42, 9.32s/it] 21%\|██▏ \| 114/536 [11:14<50:52, 7.23s/it] 21%\|██▏ \| 115/536 [11:17<40:26, 5.76s/it] {'loss': '0.5004', 'grad_norm': '0.2256', 'learning_rate': '9.612e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6525', 'tokens/total': 15073280, 'tokens/trainable': 14049913, 'epoch': '0.4389'}
	21%\|██▏ \| 115/536 [11:17<40:26, 5.76s/it] 22%\|██▏ \| 116/536 [11:19<33:04, 4.72s/it] 22%\|██▏ \| 117/536 [11:22<28:43, 4.11s/it] 22%\|██▏ \| 118/536 [11:24<24:56, 3.58s/it] 22%\|██▏ \| 119/536 [11:26<22:09, 3.19s/it] 22%\|██▏ \| 120/536 [11:29<20:22, 2.94s/it] {'loss': '0.4727', 'grad_norm': '0.248', 'learning_rate': '9.546e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6422', 'tokens/total': 15728640, 'tokens/trainable': 14657396, 'epoch': '0.458'}
	22%\|██▏ \| 120/536 [11:29<20:22, 2.94s/it] 23%\|██▎ \| 121/536 [11:31<19:02, 2.75s/it] 23%\|██▎ \| 122/536 [11:33<18:01, 2.61s/it] 23%\|██▎ \| 123/536 [11:36<17:25, 2.53s/it] 23%\|██▎ \| 124/536 [11:38<17:02, 2.48s/it] 23%\|██▎ \| 125/536 [11:40<16:31, 2.41s/it] {'loss': '0.4808', 'grad_norm': '0.2344', 'learning_rate': '9.476e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6830', 'tokens/total': 16384000, 'tokens/trainable': 15266794, 'epoch': '0.4771'}
	23%\|██▎ \| 125/536 [11:40<16:31, 2.41s/it] 24%\|██▎ \| 126/536 [11:43<16:21, 2.39s/it] 24%\|██▎ \| 127/536 [11:45<16:21, 2.40s/it] 24%\|██▍ \| 128/536 [11:47<16:06, 2.37s/it] 24%\|██▍ \| 129/536 [11:50<15:59, 2.36s/it] 24%\|██▍ \| 130/536 [11:52<15:57, 2.36s/it] {'loss': '0.4726', 'grad_norm': '0.2451', 'learning_rate': '9.401e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 17039360, 'tokens/trainable': 15876387, 'epoch': '0.4962'}
	24%\|██▍ \| 130/536 [11:52<15:57, 2.36s/it] 24%\|██▍ \| 131/536 [11:54<15:48, 2.34s/it] 25%\|██▍ \| 132/536 [11:57<15:37, 2.32s/it] 25%\|██▍ \| 133/536 [11:59<15:31, 2.31s/it] 25%\|██▌ \| 134/536 [12:01<15:54, 2.37s/it] 25%\|██▌ \| 135/536 [12:04<16:09, 2.42s/it] {'loss': '0.4864', 'grad_norm': '0.2344', 'learning_rate': '9.322e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6056', 'tokens/total': 17694720, 'tokens/trainable': 16486440, 'epoch': '0.5153'}
	25%\|██▌ \| 135/536 [12:04<16:09, 2.42s/it][2026-03-16 19:23:38,988] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-135

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.41s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.41s/it]
	25%\|██▌ \| 136/536 [13:41<3:26:11, 30.93s/it] 26%\|██▌ \| 137/536 [13:45<2:30:52, 22.69s/it] 26%\|██▌ \| 138/536 [13:47<1:49:51, 16.56s/it] 26%\|██▌ \| 139/536 [13:49<1:21:09, 12.27s/it] 26%\|██▌ \| 140/536 [13:52<1:01:08, 9.26s/it] {'loss': '0.4817', 'grad_norm': '0.2275', 'learning_rate': '9.238e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6712', 'tokens/total': 18350080, 'tokens/trainable': 17095060, 'epoch': '0.5344'}
	26%\|██▌ \| 140/536 [13:52<1:01:08, 9.26s/it] 26%\|██▋ \| 141/536 [13:54<47:09, 7.16s/it] 26%\|██▋ \| 142/536 [13:56<37:27, 5.70s/it] 27%\|██▋ \| 143/536 [13:58<30:36, 4.67s/it] 27%\|██▋ \| 144/536 [14:01<25:57, 3.97s/it] 27%\|██▋ \| 145/536 [14:03<22:36, 3.47s/it] {'loss': '0.4827', 'grad_norm': '0.249', 'learning_rate': '9.149e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6633', 'tokens/total': 19005440, 'tokens/trainable': 17703368, 'epoch': '0.5534'}
	27%\|██▋ \| 145/536 [14:03<22:36, 3.47s/it] 27%\|██▋ \| 146/536 [14:05<20:19, 3.13s/it] 27%\|██▋ \| 147/536 [14:08<18:42, 2.89s/it] 28%\|██▊ \| 148/536 [14:10<17:44, 2.74s/it] 28%\|██▊ \| 149/536 [14:12<16:47, 2.60s/it] 28%\|██▊ \| 150/536 [14:15<16:15, 2.53s/it] {'loss': '0.4892', 'grad_norm': '0.2217', 'learning_rate': '9.057e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6466', 'tokens/total': 19660800, 'tokens/trainable': 18311084, 'epoch': '0.5725'}
	28%\|██▊ \| 150/536 [14:15<16:15, 2.53s/it] 28%\|██▊ \| 151/536 [14:17<15:49, 2.47s/it] 28%\|██▊ \| 152/536 [14:20<15:51, 2.48s/it] 29%\|██▊ \| 153/536 [14:22<16:18, 2.55s/it] 29%\|██▊ \| 154/536 [14:25<16:18, 2.56s/it] 29%\|██▉ \| 155/536 [14:27<16:04, 2.53s/it] {'loss': '0.4618', 'grad_norm': '0.2236', 'learning_rate': '8.959e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6104', 'tokens/total': 20316160, 'tokens/trainable': 18920000, 'epoch': '0.5916'}
	29%\|██▉ \| 155/536 [14:27<16:04, 2.53s/it] 29%\|██▉ \| 156/536 [14:30<15:36, 2.47s/it] 29%\|██▉ \| 157/536 [14:32<15:16, 2.42s/it] 29%\|██▉ \| 158/536 [14:35<15:24, 2.45s/it] 30%\|██▉ \| 159/536 [14:37<15:05, 2.40s/it] 30%\|██▉ \| 160/536 [14:39<14:54, 2.38s/it] {'loss': '0.471', 'grad_norm': '0.2793', 'learning_rate': '8.858e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6516', 'tokens/total': 20971520, 'tokens/trainable': 19529720, 'epoch': '0.6107'}
	30%\|██▉ \| 160/536 [14:39<14:54, 2.38s/it] 30%\|███ \| 161/536 [14:41<14:48, 2.37s/it] 30%\|███ \| 162/536 [14:44<14:28, 2.32s/it][2026-03-16 19:26:18,649] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-162

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.63s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.63s/it]
	30%\|███ \| 163/536 [16:21<3:11:06, 30.74s/it] 31%\|███ \| 164/536 [16:23<2:17:38, 22.20s/it] 31%\|███ \| 165/536 [16:25<1:40:18, 16.22s/it] {'loss': '0.4703', 'grad_norm': '0.2383', 'learning_rate': '8.752e-06', 'ppl': '1.6', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6785', 'tokens/total': 21626880, 'tokens/trainable': 20137712, 'epoch': '0.6298'}
	31%\|███ \| 165/536 [16:25<1:40:18, 16.22s/it] 31%\|███ \| 166/536 [16:28<1:14:13, 12.04s/it] 31%\|███ \| 167/536 [16:30<56:04, 9.12s/it] 31%\|███▏ \| 168/536 [16:32<43:22, 7.07s/it] 32%\|███▏ \| 169/536 [16:34<34:28, 5.64s/it] 32%\|███▏ \| 170/536 [16:37<28:13, 4.63s/it] {'loss': '0.4727', 'grad_norm': '0.2139', 'learning_rate': '8.643e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6694', 'tokens/total': 22282240, 'tokens/trainable': 20749040, 'epoch': '0.6489'}
	32%\|███▏ \| 170/536 [16:37<28:13, 4.63s/it] 32%\|███▏ \| 171/536 [16:39<24:07, 3.96s/it] 32%\|███▏ \| 172/536 [16:42<21:19, 3.51s/it] 32%\|███▏ \| 173/536 [16:44<19:34, 3.24s/it] 32%\|███▏ \| 174/536 [16:46<17:46, 2.95s/it] 33%\|███▎ \| 175/536 [16:49<16:24, 2.73s/it] {'loss': '0.4856', 'grad_norm': '0.2119', 'learning_rate': '8.53e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6812', 'tokens/total': 22937600, 'tokens/trainable': 21358216, 'epoch': '0.6679'}
	33%\|███▎ \| 175/536 [16:49<16:24, 2.73s/it] 33%\|███▎ \| 176/536 [16:51<15:32, 2.59s/it] 33%\|███▎ \| 177/536 [16:54<16:38, 2.78s/it] 33%\|███▎ \| 178/536 [16:57<15:52, 2.66s/it] 33%\|███▎ \| 179/536 [16:59<15:07, 2.54s/it] 34%\|███▎ \| 180/536 [17:01<14:47, 2.49s/it] {'loss': '0.4551', 'grad_norm': '0.2266', 'learning_rate': '8.413e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6375', 'tokens/total': 23592960, 'tokens/trainable': 21963408, 'epoch': '0.687'}
	34%\|███▎ \| 180/536 [17:01<14:47, 2.49s/it] 34%\|███▍ \| 181/536 [17:04<14:25, 2.44s/it] 34%\|███▍ \| 182/536 [17:06<14:08, 2.40s/it] 34%\|███▍ \| 183/536 [17:08<13:47, 2.34s/it] 34%\|███▍ \| 184/536 [17:10<13:42, 2.34s/it] 35%\|███▍ \| 185/536 [17:13<13:34, 2.32s/it] {'loss': '0.4654', 'grad_norm': '0.2695', 'learning_rate': '8.292e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6688', 'tokens/total': 24248320, 'tokens/trainable': 22570984, 'epoch': '0.7061'}
	35%\|███▍ \| 185/536 [17:13<13:34, 2.32s/it] 35%\|███▍ \| 186/536 [17:15<13:21, 2.29s/it] 35%\|███▍ \| 187/536 [17:17<13:19, 2.29s/it] 35%\|███▌ \| 188/536 [17:19<13:11, 2.28s/it] 35%\|███▌ \| 189/536 [17:22<13:08, 2.27s/it][2026-03-16 19:28:56,617] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-189

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.04s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.04s/it]
	35%\|███▌ \| 190/536 [19:01<3:00:49, 31.36s/it] {'loss': '0.4727', 'grad_norm': '0.2285', 'learning_rate': '8.168e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4795', 'tokens/total': 24903680, 'tokens/trainable': 23180680, 'epoch': '0.7252'}
	35%\|███▌ \| 190/536 [19:01<3:00:49, 31.36s/it] 36%\|███▌ \| 191/536 [19:03<2:10:15, 22.65s/it] 36%\|███▌ \| 192/536 [19:06<1:35:23, 16.64s/it] 36%\|███▌ \| 193/536 [19:08<1:11:08, 12.44s/it] 36%\|███▌ \| 194/536 [19:11<53:42, 9.42s/it] 36%\|███▋ \| 195/536 [19:13<41:27, 7.30s/it] {'loss': '0.462', 'grad_norm': '0.2158', 'learning_rate': '8.041e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6567', 'tokens/total': 25559040, 'tokens/trainable': 23790730, 'epoch': '0.7443'}
	36%\|███▋ \| 195/536 [19:13<41:27, 7.30s/it] 37%\|███▋ \| 196/536 [19:16<32:54, 5.81s/it] 37%\|███▋ \| 197/536 [19:18<27:50, 4.93s/it] 37%\|███▋ \| 198/536 [19:21<23:17, 4.13s/it] 37%\|███▋ \| 199/536 [19:23<20:04, 3.57s/it] 37%\|███▋ \| 200/536 [19:25<17:43, 3.17s/it] {'loss': '0.4676', 'grad_norm': '0.2188', 'learning_rate': '7.91e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6880', 'tokens/total': 26214400, 'tokens/trainable': 24401252, 'epoch': '0.7634'}
	37%\|███▋ \| 200/536 [19:25<17:43, 3.17s/it] 38%\|███▊ \| 201/536 [19:27<16:11, 2.90s/it] 38%\|███▊ \| 202/536 [19:30<15:05, 2.71s/it] 38%\|███▊ \| 203/536 [19:32<14:15, 2.57s/it] 38%\|███▊ \| 204/536 [19:34<13:41, 2.47s/it] 38%\|███▊ \| 205/536 [19:37<13:22, 2.43s/it] {'loss': '0.4504', 'grad_norm': '0.2158', 'learning_rate': '7.776e-06', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6617', 'tokens/total': 26869760, 'tokens/trainable': 25010696, 'epoch': '0.7824'}
	38%\|███▊ \| 205/536 [19:37<13:22, 2.43s/it] 38%\|███▊ \| 206/536 [19:39<13:08, 2.39s/it] 39%\|███▊ \| 207/536 [19:41<12:51, 2.34s/it] 39%\|███▉ \| 208/536 [19:43<12:41, 2.32s/it] 39%\|███▉ \| 209/536 [19:46<12:40, 2.32s/it] 39%\|███▉ \| 210/536 [19:48<13:00, 2.39s/it] {'loss': '0.4614', 'grad_norm': '0.2295', 'learning_rate': '7.639e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5994', 'tokens/total': 27525120, 'tokens/trainable': 25617872, 'epoch': '0.8015'}
	39%\|███▉ \| 210/536 [19:48<13:00, 2.39s/it] 39%\|███▉ \| 211/536 [19:51<13:14, 2.44s/it] 40%\|███▉ \| 212/536 [19:53<12:58, 2.40s/it] 40%\|███▉ \| 213/536 [19:55<12:42, 2.36s/it] 40%\|███▉ \| 214/536 [19:58<12:29, 2.33s/it] 40%\|████ \| 215/536 [20:00<12:22, 2.31s/it] {'loss': '0.477', 'grad_norm': '0.2412', 'learning_rate': '7.5e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6697', 'tokens/total': 28180480, 'tokens/trainable': 26227438, 'epoch': '0.8206'}
	40%\|████ \| 215/536 [20:00<12:22, 2.31s/it] 40%\|████ \| 216/536 [20:02<12:22, 2.32s/it][2026-03-16 19:31:37,309] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-216

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.85s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.85s/it]
	40%\|████ \| 217/536 [21:40<2:45:11, 31.07s/it] 41%\|████ \| 218/536 [21:43<1:59:15, 22.50s/it] 41%\|████ \| 219/536 [21:45<1:26:49, 16.43s/it] 41%\|████ \| 220/536 [21:47<1:04:08, 12.18s/it] {'loss': '0.4535', 'grad_norm': '0.2148', 'learning_rate': '7.358e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 28835840, 'tokens/trainable': 26833456, 'epoch': '0.8397'}
	41%\|████ \| 220/536 [21:47<1:04:08, 12.18s/it] 41%\|████ \| 221/536 [21:50<48:25, 9.22s/it] 41%\|████▏ \| 222/536 [21:52<37:25, 7.15s/it] 42%\|████▏ \| 223/536 [21:54<29:44, 5.70s/it] 42%\|████▏ \| 224/536 [21:57<24:17, 4.67s/it] 42%\|████▏ \| 225/536 [21:59<20:30, 3.96s/it] {'loss': '0.4639', 'grad_norm': '0.2197', 'learning_rate': '7.213e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 29491200, 'tokens/trainable': 27444416, 'epoch': '0.8588'}
	42%\|████▏ \| 225/536 [21:59<20:30, 3.96s/it] 42%\|████▏ \| 226/536 [22:01<17:50, 3.45s/it] 42%\|████▏ \| 227/536 [22:03<15:56, 3.10s/it] 43%\|████▎ \| 228/536 [22:06<14:40, 2.86s/it] 43%\|████▎ \| 229/536 [22:08<14:24, 2.82s/it] 43%\|████▎ \| 230/536 [22:11<14:01, 2.75s/it] {'loss': '0.4578', 'grad_norm': '0.2217', 'learning_rate': '7.066e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5816', 'tokens/total': 30146560, 'tokens/trainable': 28048432, 'epoch': '0.8779'}
	43%\|████▎ \| 230/536 [22:11<14:01, 2.75s/it] 43%\|████▎ \| 231/536 [22:13<13:21, 2.63s/it] 43%\|████▎ \| 232/536 [22:16<12:45, 2.52s/it] 43%\|████▎ \| 233/536 [22:18<12:29, 2.47s/it] 44%\|████▎ \| 234/536 [22:20<12:15, 2.44s/it] 44%\|████▍ \| 235/536 [22:23<11:59, 2.39s/it] {'loss': '0.4497', 'grad_norm': '0.2354', 'learning_rate': '6.917e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6580', 'tokens/total': 30801920, 'tokens/trainable': 28655952, 'epoch': '0.8969'}
	44%\|████▍ \| 235/536 [22:23<11:59, 2.39s/it] 44%\|████▍ \| 236/536 [22:25<11:55, 2.39s/it] 44%\|████▍ \| 237/536 [22:27<11:47, 2.37s/it] 44%\|████▍ \| 238/536 [22:30<11:38, 2.34s/it] 45%\|████▍ \| 239/536 [22:32<11:38, 2.35s/it] 45%\|████▍ \| 240/536 [22:34<11:23, 2.31s/it] {'loss': '0.4693', 'grad_norm': '0.2275', 'learning_rate': '6.766e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 31457280, 'tokens/trainable': 29262050, 'epoch': '0.916'}
	45%\|████▍ \| 240/536 [22:34<11:23, 2.31s/it] 45%\|████▍ \| 241/536 [22:37<11:19, 2.30s/it] 45%\|████▌ \| 242/536 [22:39<11:12, 2.29s/it] 45%\|████▌ \| 243/536 [22:41<11:10, 2.29s/it][2026-03-16 19:34:16,197] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-243

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.81s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.82s/it]
	46%\|████▌ \| 244/536 [24:21<2:33:31, 31.55s/it] 46%\|████▌ \| 245/536 [24:23<1:50:23, 22.76s/it] {'loss': '0.4629', 'grad_norm': '0.2178', 'learning_rate': '6.613e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 32112640, 'tokens/trainable': 29868356, 'epoch': '0.9351'}
	46%\|████▌ \| 245/536 [24:23<1:50:23, 22.76s/it] 46%\|████▌ \| 246/536 [24:25<1:20:19, 16.62s/it] 46%\|████▌ \| 247/536 [24:28<59:24, 12.33s/it] 46%\|████▋ \| 248/536 [24:30<45:07, 9.40s/it] 46%\|████▋ \| 249/536 [24:33<34:46, 7.27s/it] 47%\|████▋ \| 250/536 [24:35<27:30, 5.77s/it] {'loss': '0.474', 'grad_norm': '0.2539', 'learning_rate': '6.458e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6714', 'tokens/total': 32768000, 'tokens/trainable': 30473100, 'epoch': '0.9542'}
	47%\|████▋ \| 250/536 [24:35<27:30, 5.77s/it] 47%\|████▋ \| 251/536 [24:37<22:25, 4.72s/it] 47%\|████▋ \| 252/536 [24:39<18:50, 3.98s/it] 47%\|████▋ \| 253/536 [24:42<16:27, 3.49s/it] 47%\|████▋ \| 254/536 [24:44<14:46, 3.14s/it] 48%\|████▊ \| 255/536 [24:46<13:26, 2.87s/it] {'loss': '0.467', 'grad_norm': '0.2305', 'learning_rate': '6.302e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6772', 'tokens/total': 33423360, 'tokens/trainable': 31078478, 'epoch': '0.9733'}
	48%\|████▊ \| 255/536 [24:46<13:26, 2.87s/it] 48%\|████▊ \| 256/536 [24:49<12:32, 2.69s/it] 48%\|████▊ \| 257/536 [24:51<11:59, 2.58s/it] 48%\|████▊ \| 258/536 [24:53<11:39, 2.52s/it] 48%\|████▊ \| 259/536 [24:56<11:19, 2.45s/it] 49%\|████▊ \| 260/536 [24:58<11:02, 2.40s/it] {'loss': '0.4511', 'grad_norm': '0.2148', 'learning_rate': '6.144e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6619', 'tokens/total': 34078720, 'tokens/trainable': 31682612, 'epoch': '0.9924'}
	49%\|████▊ \| 260/536 [24:58<11:02, 2.40s/it] 49%\|████▊ \| 261/536 [25:00<10:50, 2.36s/it] 49%\|████▉ \| 262/536 [25:03<10:56, 2.40s/it] 49%\|████▉ \| 263/536 [25:06<12:20, 2.71s/it] 49%\|████▉ \| 264/536 [25:08<11:41, 2.58s/it] 49%\|████▉ \| 265/536 [25:11<11:22, 2.52s/it] {'loss': '0.4682', 'grad_norm': '0.2451', 'learning_rate': '5.985e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6472', 'tokens/total': 34734080, 'tokens/trainable': 32293470, 'epoch': '1.011'}
	49%\|████▉ \| 265/536 [25:11<11:22, 2.52s/it] 50%\|████▉ \| 266/536 [25:13<11:14, 2.50s/it] 50%\|████▉ \| 267/536 [25:16<11:28, 2.56s/it] 50%\|█████ \| 268/536 [25:18<11:16, 2.52s/it] 50%\|█████ \| 269/536 [25:21<10:53, 2.45s/it] 50%\|█████ \| 270/536 [25:23<10:39, 2.40s/it] {'loss': '0.461', 'grad_norm': '0.2207', 'learning_rate': '5.826e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6685', 'tokens/total': 35389440, 'tokens/trainable': 32904464, 'epoch': '1.031'}
	50%\|█████ \| 270/536 [25:23<10:39, 2.40s/it][2026-03-16 19:36:59,256] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-270

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.30s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.30s/it]
	51%\|█████ \| 271/536 [27:02<2:18:44, 31.41s/it] 51%\|█████ \| 272/536 [27:04<1:39:50, 22.69s/it] 51%\|█████ \| 273/536 [27:07<1:12:35, 16.56s/it] 51%\|█████ \| 274/536 [27:09<53:33, 12.26s/it] 51%\|█████▏ \| 275/536 [27:11<40:23, 9.29s/it] {'loss': '0.4545', 'grad_norm': '0.2324', 'learning_rate': '5.665e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6568', 'tokens/total': 36044800, 'tokens/trainable': 33517832, 'epoch': '1.05'}
	51%\|█████▏ \| 275/536 [27:11<40:23, 9.29s/it] 51%\|█████▏ \| 276/536 [27:15<32:37, 7.53s/it] 52%\|█████▏ \| 277/536 [27:17<25:45, 5.97s/it] 52%\|█████▏ \| 278/536 [27:19<20:59, 4.88s/it] 52%\|█████▏ \| 279/536 [27:22<17:35, 4.11s/it] 52%\|█████▏ \| 280/536 [27:24<15:12, 3.56s/it] {'loss': '0.447', 'grad_norm': '0.2158', 'learning_rate': '5.503e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 36700160, 'tokens/trainable': 34129632, 'epoch': '1.069'}
	52%\|█████▏ \| 280/536 [27:24<15:12, 3.56s/it] 52%\|█████▏ \| 281/536 [27:26<13:35, 3.20s/it] 53%\|█████▎ \| 282/536 [27:28<12:20, 2.91s/it] 53%\|█████▎ \| 283/536 [27:31<11:30, 2.73s/it] 53%\|█████▎ \| 284/536 [27:33<10:54, 2.60s/it] 53%\|█████▎ \| 285/536 [27:36<10:51, 2.60s/it] {'loss': '0.4378', 'grad_norm': '0.2119', 'learning_rate': '5.341e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5854', 'tokens/total': 37355520, 'tokens/trainable': 34742888, 'epoch': '1.088'}
	53%\|█████▎ \| 285/536 [27:36<10:51, 2.60s/it] 53%\|█████▎ \| 286/536 [27:38<10:42, 2.57s/it] 54%\|█████▎ \| 287/536 [27:41<10:26, 2.51s/it] 54%\|█████▎ \| 288/536 [27:43<10:02, 2.43s/it] 54%\|█████▍ \| 289/536 [27:45<09:45, 2.37s/it] 54%\|█████▍ \| 290/536 [27:47<09:40, 2.36s/it] {'loss': '0.4756', 'grad_norm': '0.2246', 'learning_rate': '5.179e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6534', 'tokens/total': 38010880, 'tokens/trainable': 35352848, 'epoch': '1.107'}
	54%\|█████▍ \| 290/536 [27:47<09:40, 2.36s/it] 54%\|█████▍ \| 291/536 [27:50<09:33, 2.34s/it] 54%\|█████▍ \| 292/536 [27:52<09:27, 2.33s/it] 55%\|█████▍ \| 293/536 [27:54<09:23, 2.32s/it] 55%\|█████▍ \| 294/536 [27:57<09:23, 2.33s/it] 55%\|█████▌ \| 295/536 [27:59<09:21, 2.33s/it] {'loss': '0.4635', 'grad_norm': '0.2188', 'learning_rate': '5.016e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6541', 'tokens/total': 38666240, 'tokens/trainable': 35964736, 'epoch': '1.126'}
	55%\|█████▌ \| 295/536 [27:59<09:21, 2.33s/it] 55%\|█████▌ \| 296/536 [28:01<09:16, 2.32s/it] 55%\|█████▌ \| 297/536 [28:03<09:09, 2.30s/it][2026-03-16 19:39:38,467] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-297

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.82s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.82s/it]
	56%\|█████▌ \| 298/536 [29:43<2:05:14, 31.57s/it] 56%\|█████▌ \| 299/536 [29:46<1:30:06, 22.81s/it] 56%\|█████▌ \| 300/536 [29:48<1:05:28, 16.65s/it] {'loss': '0.4578', 'grad_norm': '0.2334', 'learning_rate': '4.854e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6804', 'tokens/total': 39321600, 'tokens/trainable': 36579308, 'epoch': '1.145'}
	56%\|█████▌ \| 300/536 [29:48<1:05:28, 16.65s/it] 56%\|█████▌ \| 301/536 [29:50<48:19, 12.34s/it] 56%\|█████▋ \| 302/536 [29:53<36:18, 9.31s/it] 57%\|█████▋ \| 303/536 [29:55<27:59, 7.21s/it] 57%\|█████▋ \| 304/536 [29:57<22:22, 5.78s/it] 57%\|█████▋ \| 305/536 [30:00<18:50, 4.89s/it] {'loss': '0.4526', 'grad_norm': '0.2129', 'learning_rate': '4.691e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6214', 'tokens/total': 39976960, 'tokens/trainable': 37187912, 'epoch': '1.164'}
	57%\|█████▋ \| 305/536 [30:00<18:50, 4.89s/it] 57%\|█████▋ \| 306/536 [30:02<15:44, 4.11s/it] 57%\|█████▋ \| 307/536 [30:05<13:31, 3.55s/it] 57%\|█████▋ \| 308/536 [30:07<12:06, 3.19s/it] 58%\|█████▊ \| 309/536 [30:09<11:00, 2.91s/it] 58%\|█████▊ \| 310/536 [30:12<10:18, 2.74s/it] {'loss': '0.4482', 'grad_norm': '0.21', 'learning_rate': '4.529e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6526', 'tokens/total': 40632320, 'tokens/trainable': 37799984, 'epoch': '1.183'}
	58%\|█████▊ \| 310/536 [30:12<10:18, 2.74s/it] 58%\|█████▊ \| 311/536 [30:14<09:49, 2.62s/it] 58%\|█████▊ \| 312/536 [30:16<09:25, 2.53s/it] 58%\|█████▊ \| 313/536 [30:18<09:07, 2.45s/it] 59%\|█████▊ \| 314/536 [30:21<09:33, 2.58s/it] 59%\|█████▉ \| 315/536 [30:24<09:07, 2.48s/it] {'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '4.368e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6899', 'tokens/total': 41287680, 'tokens/trainable': 38409832, 'epoch': '1.202'}
	59%\|█████▉ \| 315/536 [30:24<09:07, 2.48s/it] 59%\|█████▉ \| 316/536 [30:26<08:55, 2.43s/it] 59%\|█████▉ \| 317/536 [30:28<08:44, 2.39s/it] 59%\|█████▉ \| 318/536 [30:31<08:35, 2.36s/it] 60%\|█████▉ \| 319/536 [30:33<08:25, 2.33s/it] 60%\|█████▉ \| 320/536 [30:35<08:19, 2.31s/it] {'loss': '0.4539', 'grad_norm': '0.2285', 'learning_rate': '4.207e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6757', 'tokens/total': 41943040, 'tokens/trainable': 39020096, 'epoch': '1.221'}
	60%\|█████▉ \| 320/536 [30:35<08:19, 2.31s/it] 60%\|█████▉ \| 321/536 [30:37<08:17, 2.31s/it] 60%\|██████ \| 322/536 [30:40<08:09, 2.29s/it] 60%\|██████ \| 323/536 [30:42<08:19, 2.35s/it] 60%\|██████ \| 324/536 [30:45<08:26, 2.39s/it][2026-03-16 19:42:20,100] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-324

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.49s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.49s/it]
	61%\|██████ \| 325/536 [32:22<1:48:54, 30.97s/it] {'loss': '0.4481', 'grad_norm': '0.2246', 'learning_rate': '4.046e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6887', 'tokens/total': 42598400, 'tokens/trainable': 39629160, 'epoch': '1.24'}
	61%\|██████ \| 325/536 [32:22<1:48:54, 30.97s/it] 61%\|██████ \| 326/536 [32:24<1:18:13, 22.35s/it] 61%\|██████ \| 327/536 [32:27<56:54, 16.34s/it] 61%\|██████ \| 328/536 [32:29<42:00, 12.12s/it] 61%\|██████▏ \| 329/536 [32:31<31:43, 9.20s/it] 62%\|██████▏ \| 330/536 [32:34<24:27, 7.12s/it] {'loss': '0.4542', 'grad_norm': '0.2256', 'learning_rate': '3.887e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6731', 'tokens/total': 43253760, 'tokens/trainable': 40237288, 'epoch': '1.26'}
	62%\|██████▏ \| 330/536 [32:34<24:27, 7.12s/it] 62%\|██████▏ \| 331/536 [32:36<19:21, 5.67s/it] 62%\|██████▏ \| 332/536 [32:39<16:57, 4.99s/it] 62%\|██████▏ \| 333/536 [32:42<14:06, 4.17s/it] 62%\|██████▏ \| 334/536 [32:44<12:07, 3.60s/it] 62%\|██████▎ \| 335/536 [32:46<10:43, 3.20s/it] {'loss': '0.4412', 'grad_norm': '0.2539', 'learning_rate': '3.729e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6745', 'tokens/total': 43909120, 'tokens/trainable': 40848032, 'epoch': '1.279'}
	62%\|██████▎ \| 335/536 [32:46<10:43, 3.20s/it] 63%\|██████▎ \| 336/536 [32:48<09:46, 2.93s/it] 63%\|██████▎ \| 337/536 [32:51<09:05, 2.74s/it] 63%\|██████▎ \| 338/536 [32:53<08:37, 2.61s/it] 63%\|██████▎ \| 339/536 [32:55<08:14, 2.51s/it] 63%\|██████▎ \| 340/536 [32:58<08:04, 2.47s/it] {'loss': '0.4615', 'grad_norm': '0.2217', 'learning_rate': '3.573e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6370', 'tokens/total': 44564480, 'tokens/trainable': 41457624, 'epoch': '1.298'}
	63%\|██████▎ \| 340/536 [32:58<08:04, 2.47s/it] 64%\|██████▎ \| 341/536 [33:00<08:05, 2.49s/it] 64%\|██████▍ \| 342/536 [33:03<07:51, 2.43s/it] 64%\|██████▍ \| 343/536 [33:05<07:49, 2.43s/it] 64%\|██████▍ \| 344/536 [33:08<07:51, 2.46s/it] 64%\|██████▍ \| 345/536 [33:10<07:38, 2.40s/it] {'loss': '0.4599', 'grad_norm': '0.2188', 'learning_rate': '3.418e-06', 'ppl': '1.584', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6795', 'tokens/total': 45219840, 'tokens/trainable': 42069272, 'epoch': '1.317'}
	64%\|██████▍ \| 345/536 [33:10<07:38, 2.40s/it] 65%\|██████▍ \| 346/536 [33:12<07:29, 2.37s/it] 65%\|██████▍ \| 347/536 [33:14<07:26, 2.36s/it] 65%\|██████▍ \| 348/536 [33:17<07:20, 2.34s/it] 65%\|██████▌ \| 349/536 [33:19<07:12, 2.32s/it] 65%\|██████▌ \| 350/536 [33:21<07:09, 2.31s/it] {'loss': '0.4499', 'grad_norm': '0.2148', 'learning_rate': '3.264e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6691', 'tokens/total': 45875200, 'tokens/trainable': 42681132, 'epoch': '1.336'}
	65%\|██████▌ \| 350/536 [33:21<07:09, 2.31s/it] 65%\|██████▌ \| 351/536 [33:24<07:04, 2.29s/it][2026-03-16 19:44:58,459] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-351

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.55s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.55s/it]
	66%\|██████▌ \| 352/536 [35:02<1:35:17, 31.07s/it] 66%\|██████▌ \| 353/536 [35:04<1:08:31, 22.47s/it] 66%\|██████▌ \| 354/536 [35:06<49:47, 16.42s/it] 66%\|██████▌ \| 355/536 [35:09<36:49, 12.21s/it] {'loss': '0.4529', 'grad_norm': '0.249', 'learning_rate': '3.113e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6371', 'tokens/total': 46530560, 'tokens/trainable': 43292904, 'epoch': '1.355'}
	66%\|██████▌ \| 355/536 [35:09<36:49, 12.21s/it] 66%\|██████▋ \| 356/536 [35:11<27:37, 9.21s/it] 67%\|██████▋ \| 357/536 [35:13<21:14, 7.12s/it] 67%\|██████▋ \| 358/536 [35:16<16:48, 5.67s/it] 67%\|██████▋ \| 359/536 [35:18<13:56, 4.73s/it] 67%\|██████▋ \| 360/536 [35:21<11:51, 4.04s/it] {'loss': '0.4461', 'grad_norm': '0.2207', 'learning_rate': '2.963e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6150', 'tokens/total': 47185920, 'tokens/trainable': 43900272, 'epoch': '1.374'}
	67%\|██████▋ \| 360/536 [35:21<11:51, 4.04s/it] 67%\|██████▋ \| 361/536 [35:23<10:14, 3.51s/it] 68%\|██████▊ \| 362/536 [35:25<09:24, 3.25s/it] 68%\|██████▊ \| 363/536 [35:28<08:33, 2.97s/it] 68%\|██████▊ \| 364/536 [35:30<07:52, 2.75s/it] 68%\|██████▊ \| 365/536 [35:32<07:23, 2.59s/it] {'loss': '0.4581', 'grad_norm': '0.3555', 'learning_rate': '2.816e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6767', 'tokens/total': 47841280, 'tokens/trainable': 44509872, 'epoch': '1.393'}
	68%\|██████▊ \| 365/536 [35:32<07:23, 2.59s/it] 68%\|██████▊ \| 366/536 [35:34<07:04, 2.50s/it] 68%\|██████▊ \| 367/536 [35:37<06:49, 2.42s/it] 69%\|██████▊ \| 368/536 [35:39<06:39, 2.38s/it] 69%\|██████▉ \| 369/536 [35:41<06:33, 2.36s/it] 69%\|██████▉ \| 370/536 [35:44<06:33, 2.37s/it] {'loss': '0.4483', 'grad_norm': '0.2109', 'learning_rate': '2.671e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6359', 'tokens/total': 48496640, 'tokens/trainable': 45121444, 'epoch': '1.412'}
	69%\|██████▉ \| 370/536 [35:44<06:33, 2.37s/it] 69%\|██████▉ \| 371/536 [35:46<06:27, 2.35s/it] 69%\|██████▉ \| 372/536 [35:48<06:25, 2.35s/it] 70%\|██████▉ \| 373/536 [35:51<06:17, 2.31s/it] 70%\|██████▉ \| 374/536 [35:53<06:12, 2.30s/it] 70%\|██████▉ \| 375/536 [35:55<06:13, 2.32s/it] {'loss': '0.4475', 'grad_norm': '0.2617', 'learning_rate': '2.528e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6505', 'tokens/total': 49152000, 'tokens/trainable': 45733296, 'epoch': '1.431'}
	70%\|██████▉ \| 375/536 [35:55<06:13, 2.32s/it] 70%\|███████ \| 376/536 [35:58<06:09, 2.31s/it] 70%\|███████ \| 377/536 [36:00<06:26, 2.43s/it] 71%\|███████ \| 378/536 [36:03<06:17, 2.39s/it][2026-03-16 19:47:37,290] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-378

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.77s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.77s/it]
	71%\|███████ \| 379/536 [37:39<1:20:25, 30.73s/it] 71%\|███████ \| 380/536 [37:42<57:44, 22.21s/it] {'loss': '0.4467', 'grad_norm': '0.208', 'learning_rate': '2.388e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6639', 'tokens/total': 49807360, 'tokens/trainable': 46341868, 'epoch': '1.45'}
	71%\|███████ \| 380/536 [37:42<57:44, 22.21s/it] 71%\|███████ \| 381/536 [37:44<41:54, 16.22s/it] 71%\|███████▏ \| 382/536 [37:46<31:01, 12.09s/it] 71%\|███████▏ \| 383/536 [37:49<23:20, 9.16s/it] 72%\|███████▏ \| 384/536 [37:51<17:57, 7.09s/it] 72%\|███████▏ \| 385/536 [37:53<14:15, 5.66s/it] {'loss': '0.4373', 'grad_norm': '0.2129', 'learning_rate': '2.251e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6487', 'tokens/total': 50462720, 'tokens/trainable': 46948144, 'epoch': '1.469'}
	72%\|███████▏ \| 385/536 [37:53<14:15, 5.66s/it] 72%\|███████▏ \| 386/536 [37:56<11:39, 4.67s/it] 72%\|███████▏ \| 387/536 [37:58<09:46, 3.94s/it] 72%\|███████▏ \| 388/536 [38:00<08:31, 3.45s/it] 73%\|███████▎ \| 389/536 [38:03<07:40, 3.14s/it] 73%\|███████▎ \| 390/536 [38:05<07:00, 2.88s/it] {'loss': '0.452', 'grad_norm': '0.2314', 'learning_rate': '2.117e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 51118080, 'tokens/trainable': 47558056, 'epoch': '1.489'}
	73%\|███████▎ \| 390/536 [38:05<07:00, 2.88s/it] 73%\|███████▎ \| 391/536 [38:08<07:22, 3.05s/it] 73%\|███████▎ \| 392/536 [38:11<06:49, 2.85s/it] 73%\|███████▎ \| 393/536 [38:13<06:22, 2.68s/it] 74%\|███████▎ \| 394/536 [38:15<06:02, 2.55s/it] 74%\|███████▎ \| 395/536 [38:18<06:01, 2.56s/it] {'loss': '0.4435', 'grad_norm': '0.2139', 'learning_rate': '1.985e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5877', 'tokens/total': 51773440, 'tokens/trainable': 48168520, 'epoch': '1.508'}
	74%\|███████▎ \| 395/536 [38:18<06:01, 2.56s/it] 74%\|███████▍ \| 396/536 [38:20<05:47, 2.48s/it] 74%\|███████▍ \| 397/536 [38:22<05:35, 2.42s/it] 74%\|███████▍ \| 398/536 [38:25<05:27, 2.37s/it] 74%\|███████▍ \| 399/536 [38:27<05:22, 2.35s/it] 75%\|███████▍ \| 400/536 [38:29<05:21, 2.37s/it] {'loss': '0.4444', 'grad_norm': '0.2236', 'learning_rate': '1.857e-06', 'ppl': '1.56', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6406', 'tokens/total': 52428800, 'tokens/trainable': 48779416, 'epoch': '1.527'}
	75%\|███████▍ \| 400/536 [38:29<05:21, 2.37s/it] 75%\|███████▍ \| 401/536 [38:32<05:15, 2.34s/it] 75%\|███████▌ \| 402/536 [38:34<05:10, 2.32s/it] 75%\|███████▌ \| 403/536 [38:36<05:10, 2.34s/it] 75%\|███████▌ \| 404/536 [38:39<05:07, 2.33s/it] 76%\|███████▌ \| 405/536 [38:41<05:03, 2.32s/it] {'loss': '0.4557', 'grad_norm': '0.2324', 'learning_rate': '1.732e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6719', 'tokens/total': 53084160, 'tokens/trainable': 49387032, 'epoch': '1.546'}
	76%\|███████▌ \| 405/536 [38:41<05:03, 2.32s/it][2026-03-16 19:50:15,755] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-405

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.46s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.46s/it]
	76%\|███████▌ \| 406/536 [40:19<1:06:58, 30.91s/it] 76%\|███████▌ \| 407/536 [40:21<48:00, 22.33s/it] 76%\|███████▌ \| 408/536 [40:23<34:49, 16.33s/it] 76%\|███████▋ \| 409/536 [40:25<25:39, 12.12s/it] 76%\|███████▋ \| 410/536 [40:28<19:38, 9.36s/it] {'loss': '0.4617', 'grad_norm': '0.2168', 'learning_rate': '1.611e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5250', 'tokens/total': 53739520, 'tokens/trainable': 49995344, 'epoch': '1.565'}
	76%\|███████▋ \| 410/536 [40:28<19:38, 9.36s/it] 77%\|███████▋ \| 411/536 [40:31<15:03, 7.23s/it] 77%\|███████▋ \| 412/536 [40:33<11:54, 5.76s/it] 77%\|███████▋ \| 413/536 [40:36<09:53, 4.83s/it] 77%\|███████▋ \| 414/536 [40:38<08:16, 4.07s/it] 77%\|███████▋ \| 415/536 [40:40<07:07, 3.53s/it] {'loss': '0.4492', 'grad_norm': '0.2217', 'learning_rate': '1.493e-06', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6671', 'tokens/total': 54394880, 'tokens/trainable': 50603264, 'epoch': '1.584'}
	77%\|███████▋ \| 415/536 [40:40<07:07, 3.53s/it] 78%\|███████▊ \| 416/536 [40:43<06:19, 3.17s/it] 78%\|███████▊ \| 417/536 [40:45<05:45, 2.90s/it] 78%\|███████▊ \| 418/536 [40:47<05:26, 2.77s/it] 78%\|███████▊ \| 419/536 [40:50<05:18, 2.73s/it] 78%\|███████▊ \| 420/536 [40:52<05:02, 2.61s/it] {'loss': '0.4522', 'grad_norm': '0.2676', 'learning_rate': '1.379e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 55050240, 'tokens/trainable': 51213612, 'epoch': '1.603'}
	78%\|███████▊ \| 420/536 [40:52<05:02, 2.61s/it] 79%\|███████▊ \| 421/536 [40:55<04:49, 2.52s/it] 79%\|███████▊ \| 422/536 [40:57<04:40, 2.46s/it] 79%\|███████▉ \| 423/536 [40:59<04:40, 2.49s/it] 79%\|███████▉ \| 424/536 [41:02<04:29, 2.41s/it] 79%\|███████▉ \| 425/536 [41:04<04:21, 2.36s/it] {'loss': '0.4414', 'grad_norm': '0.2168', 'learning_rate': '1.269e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6755', 'tokens/total': 55705600, 'tokens/trainable': 51819592, 'epoch': '1.622'}
	79%\|███████▉ \| 425/536 [41:04<04:21, 2.36s/it] 79%\|███████▉ \| 426/536 [41:06<04:17, 2.34s/it] 80%\|███████▉ \| 427/536 [41:08<04:13, 2.32s/it] 80%\|███████▉ \| 428/536 [41:11<04:08, 2.30s/it] 80%\|████████ \| 429/536 [41:13<04:07, 2.31s/it] 80%\|████████ \| 430/536 [41:15<04:04, 2.31s/it] {'loss': '0.4532', 'grad_norm': '0.2217', 'learning_rate': '1.163e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6642', 'tokens/total': 56360960, 'tokens/trainable': 52431520, 'epoch': '1.641'}
	80%\|████████ \| 430/536 [41:15<04:04, 2.31s/it] 80%\|████████ \| 431/536 [41:18<04:10, 2.38s/it] 81%\|████████ \| 432/536 [41:20<04:04, 2.35s/it][2026-03-16 19:52:55,057] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-432

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.45s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.45s/it]
	81%\|████████ \| 433/536 [42:58<53:14, 31.01s/it] 81%\|████████ \| 434/536 [43:00<38:05, 22.41s/it] 81%\|████████ \| 435/536 [43:03<27:33, 16.37s/it] {'loss': '0.4605', 'grad_norm': '0.3574', 'learning_rate': '1.061e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 57016320, 'tokens/trainable': 53041944, 'epoch': '1.66'}
	81%\|████████ \| 435/536 [43:03<27:33, 16.37s/it] 81%\|████████▏ \| 436/536 [43:05<20:20, 12.20s/it] 82%\|████████▏ \| 437/536 [43:07<15:11, 9.20s/it] 82%\|████████▏ \| 438/536 [43:10<11:41, 7.16s/it] 82%\|████████▏ \| 439/536 [43:12<09:11, 5.68s/it] 82%\|████████▏ \| 440/536 [43:14<07:29, 4.68s/it] {'loss': '0.446', 'grad_norm': '0.2119', 'learning_rate': '9.626e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6513', 'tokens/total': 57671680, 'tokens/trainable': 53647180, 'epoch': '1.679'}
	82%\|████████▏ \| 440/536 [43:14<07:29, 4.68s/it] 82%\|████████▏ \| 441/536 [43:17<06:23, 4.04s/it] 82%\|████████▏ \| 442/536 [43:19<05:31, 3.53s/it] 83%\|████████▎ \| 443/536 [43:21<04:53, 3.16s/it] 83%\|████████▎ \| 444/536 [43:24<04:26, 2.90s/it] 83%\|████████▎ \| 445/536 [43:26<04:04, 2.69s/it] {'loss': '0.4299', 'grad_norm': '0.2188', 'learning_rate': '8.688e-07', 'ppl': '1.537', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6951', 'tokens/total': 58327040, 'tokens/trainable': 54256432, 'epoch': '1.698'}
	83%\|████████▎ \| 445/536 [43:26<04:04, 2.69s/it] 83%\|████████▎ \| 446/536 [43:28<03:52, 2.58s/it] 83%\|████████▎ \| 447/536 [43:31<03:41, 2.49s/it] 84%\|████████▎ \| 448/536 [43:33<03:32, 2.41s/it] 84%\|████████▍ \| 449/536 [43:35<03:30, 2.42s/it] 84%\|████████▍ \| 450/536 [43:38<03:23, 2.37s/it] {'loss': '0.4583', 'grad_norm': '0.2188', 'learning_rate': '7.794e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6777', 'tokens/total': 58982400, 'tokens/trainable': 54863148, 'epoch': '1.718'}
	84%\|████████▍ \| 450/536 [43:38<03:23, 2.37s/it] 84%\|████████▍ \| 451/536 [43:40<03:28, 2.46s/it] 84%\|████████▍ \| 452/536 [43:43<03:24, 2.44s/it] 85%\|████████▍ \| 453/536 [43:45<03:19, 2.40s/it] 85%\|████████▍ \| 454/536 [43:47<03:15, 2.38s/it] 85%\|████████▍ \| 455/536 [43:50<03:12, 2.38s/it] {'loss': '0.4523', 'grad_norm': '0.2119', 'learning_rate': '6.945e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6454', 'tokens/total': 59637760, 'tokens/trainable': 55471580, 'epoch': '1.737'}
	85%\|████████▍ \| 455/536 [43:50<03:12, 2.38s/it] 85%\|████████▌ \| 456/536 [43:52<03:15, 2.44s/it] 85%\|████████▌ \| 457/536 [43:54<03:10, 2.41s/it] 85%\|████████▌ \| 458/536 [43:57<03:11, 2.46s/it] 86%\|████████▌ \| 459/536 [43:59<03:05, 2.41s/it][2026-03-16 19:55:34,637] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-459

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.63s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.63s/it]
	86%\|████████▌ \| 460/536 [45:37<39:11, 30.94s/it] {'loss': '0.4523', 'grad_norm': '0.2148', 'learning_rate': '6.141e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6711', 'tokens/total': 60293120, 'tokens/trainable': 56081832, 'epoch': '1.756'}
	86%\|████████▌ \| 460/536 [45:37<39:11, 30.94s/it] 86%\|████████▌ \| 461/536 [45:39<27:55, 22.34s/it] 86%\|████████▌ \| 462/536 [45:41<20:06, 16.30s/it] 86%\|████████▋ \| 463/536 [45:44<14:42, 12.09s/it] 87%\|████████▋ \| 464/536 [45:46<10:58, 9.14s/it] 87%\|████████▋ \| 465/536 [45:48<08:21, 7.06s/it] {'loss': '0.4461', 'grad_norm': '0.4551', 'learning_rate': '5.383e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6903', 'tokens/total': 60948480, 'tokens/trainable': 56694240, 'epoch': '1.775'}
	87%\|████████▋ \| 465/536 [45:48<08:21, 7.06s/it] 87%\|████████▋ \| 466/536 [45:50<06:33, 5.62s/it] 87%\|████████▋ \| 467/536 [45:53<05:23, 4.69s/it] 87%\|████████▋ \| 468/536 [45:55<04:29, 3.97s/it] 88%\|████████▊ \| 469/536 [45:57<03:50, 3.44s/it] 88%\|████████▊ \| 470/536 [46:01<03:45, 3.42s/it] {'loss': '0.4341', 'grad_norm': '0.208', 'learning_rate': '4.673e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4512', 'tokens/total': 61603840, 'tokens/trainable': 57301404, 'epoch': '1.794'}
	88%\|████████▊ \| 470/536 [46:01<03:45, 3.42s/it] 88%\|████████▊ \| 471/536 [46:03<03:20, 3.09s/it] 88%\|████████▊ \| 472/536 [46:05<03:02, 2.86s/it] 88%\|████████▊ \| 473/536 [46:08<02:50, 2.70s/it] 88%\|████████▊ \| 474/536 [46:10<02:39, 2.58s/it] 89%\|████████▊ \| 475/536 [46:12<02:34, 2.54s/it] {'loss': '0.4627', 'grad_norm': '0.2461', 'learning_rate': '4.011e-07', 'ppl': '1.588', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6249', 'tokens/total': 62259200, 'tokens/trainable': 57910216, 'epoch': '1.813'}
	89%\|████████▊ \| 475/536 [46:12<02:34, 2.54s/it] 89%\|████████▉ \| 476/536 [46:15<02:26, 2.45s/it] 89%\|████████▉ \| 477/536 [46:17<02:24, 2.45s/it] 89%\|████████▉ \| 478/536 [46:20<02:25, 2.51s/it] 89%\|████████▉ \| 479/536 [46:22<02:19, 2.45s/it] 90%\|████████▉ \| 480/536 [46:24<02:13, 2.39s/it] {'loss': '0.4538', 'grad_norm': '0.2178', 'learning_rate': '3.397e-07', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6635', 'tokens/total': 62914560, 'tokens/trainable': 58517712, 'epoch': '1.832'}
	90%\|████████▉ \| 480/536 [46:24<02:13, 2.39s/it] 90%\|████████▉ \| 481/536 [46:27<02:09, 2.36s/it] 90%\|████████▉ \| 482/536 [46:29<02:05, 2.33s/it] 90%\|█████████ \| 483/536 [46:31<02:03, 2.33s/it] 90%\|█████████ \| 484/536 [46:34<02:01, 2.33s/it] 90%\|█████████ \| 485/536 [46:36<02:04, 2.44s/it] {'loss': '0.4395', 'grad_norm': '0.208', 'learning_rate': '2.833e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6504', 'tokens/total': 63569920, 'tokens/trainable': 59125608, 'epoch': '1.851'}
	90%\|█████████ \| 485/536 [46:36<02:04, 2.44s/it] 91%\|█████████ \| 486/536 [46:39<01:59, 2.39s/it][2026-03-16 19:58:13,418] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-486

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.82s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.82s/it]
	91%\|█████████ \| 487/536 [48:16<25:09, 30.81s/it] 91%\|█████████ \| 488/536 [48:18<17:48, 22.26s/it] 91%\|█████████ \| 489/536 [48:20<12:44, 16.26s/it] 91%\|█████████▏\| 490/536 [48:24<09:32, 12.45s/it] {'loss': '0.4478', 'grad_norm': '0.2236', 'learning_rate': '2.318e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4290', 'tokens/total': 64225280, 'tokens/trainable': 59733412, 'epoch': '1.87'}
	91%\|█████████▏\| 490/536 [48:24<09:32, 12.45s/it] 92%\|█████████▏\| 491/536 [48:26<07:02, 9.39s/it] 92%\|█████████▏\| 492/536 [48:28<05:19, 7.26s/it] 92%\|█████████▏\| 493/536 [48:31<04:08, 5.78s/it] 92%\|█████████▏\| 494/536 [48:33<03:22, 4.82s/it] 92%\|█████████▏\| 495/536 [48:36<02:47, 4.08s/it] {'loss': '0.4362', 'grad_norm': '0.2129', 'learning_rate': '1.854e-07', 'ppl': '1.547', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6524', 'tokens/total': 64880640, 'tokens/trainable': 60339904, 'epoch': '1.889'}
	92%\|█████████▏\| 495/536 [48:36<02:47, 4.08s/it] 93%\|█████████▎\| 496/536 [48:38<02:25, 3.64s/it] 93%\|█████████▎\| 497/536 [48:40<02:06, 3.23s/it] 93%\|█████████▎\| 498/536 [48:43<01:52, 2.96s/it] 93%\|█████████▎\| 499/536 [48:45<01:42, 2.76s/it] 93%\|█████████▎\| 500/536 [48:47<01:34, 2.62s/it] {'loss': '0.4656', 'grad_norm': '0.2217', 'learning_rate': '1.441e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6649', 'tokens/total': 65536000, 'tokens/trainable': 60945412, 'epoch': '1.908'}
	93%\|█████████▎\| 500/536 [48:47<01:34, 2.62s/it] 93%\|█████████▎\| 501/536 [48:50<01:29, 2.55s/it] 94%\|█████████▎\| 502/536 [48:52<01:25, 2.52s/it] 94%\|█████████▍\| 503/536 [48:54<01:20, 2.44s/it] 94%\|█████████▍\| 504/536 [48:57<01:16, 2.39s/it] 94%\|█████████▍\| 505/536 [48:59<01:13, 2.36s/it] {'loss': '0.4466', 'grad_norm': '0.2129', 'learning_rate': '1.079e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6658', 'tokens/total': 66191360, 'tokens/trainable': 61550936, 'epoch': '1.927'}
	94%\|█████████▍\| 505/536 [48:59<01:13, 2.36s/it] 94%\|█████████▍\| 506/536 [49:01<01:09, 2.33s/it] 95%\|█████████▍\| 507/536 [49:04<01:07, 2.31s/it] 95%\|█████████▍\| 508/536 [49:06<01:04, 2.31s/it] 95%\|█████████▍\| 509/536 [49:08<01:04, 2.37s/it] 95%\|█████████▌\| 510/536 [49:11<01:01, 2.35s/it] {'loss': '0.4754', 'grad_norm': '0.2168', 'learning_rate': '7.691e-08', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 66846720, 'tokens/trainable': 62157088, 'epoch': '1.947'}
	95%\|█████████▌\| 510/536 [49:11<01:01, 2.35s/it] 95%\|█████████▌\| 511/536 [49:13<00:58, 2.34s/it] 96%\|█████████▌\| 512/536 [49:15<00:55, 2.32s/it] 96%\|█████████▌\| 513/536 [49:18<00:53, 2.31s/it][2026-03-16 20:00:52,618] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-513

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.79s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.79s/it]
	96%\|█████████▌\| 514/536 [50:57<11:33, 31.50s/it] 96%\|█████████▌\| 515/536 [51:00<07:58, 22.77s/it] {'loss': '0.4548', 'grad_norm': '0.2217', 'learning_rate': '5.11e-08', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6342', 'tokens/total': 67502080, 'tokens/trainable': 62762592, 'epoch': '1.966'}
	96%\|█████████▌\| 515/536 [51:00<07:58, 22.77s/it] 96%\|█████████▋\| 516/536 [51:02<05:32, 16.65s/it] 96%\|█████████▋\| 517/536 [51:04<03:54, 12.32s/it] 97%\|█████████▋\| 518/536 [51:06<02:47, 9.30s/it] 97%\|█████████▋\| 519/536 [51:09<02:02, 7.18s/it] 97%\|█████████▋\| 520/536 [51:11<01:32, 5.76s/it] {'loss': '0.4544', 'grad_norm': '0.2314', 'learning_rate': '3.054e-08', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6125', 'tokens/total': 68157440, 'tokens/trainable': 63366800, 'epoch': '1.985'}
	97%\|█████████▋\| 520/536 [51:11<01:32, 5.76s/it] 97%\|█████████▋\| 521/536 [51:13<01:10, 4.72s/it] 97%\|█████████▋\| 522/536 [51:16<00:55, 3.98s/it] 98%\|█████████▊\| 523/536 [51:18<00:45, 3.47s/it] 98%\|█████████▊\| 524/536 [51:20<00:38, 3.19s/it] 98%\|█████████▊\| 525/536 [51:24<00:35, 3.21s/it] {'loss': '0.4504', 'grad_norm': '0.2246', 'learning_rate': '1.522e-08', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6753', 'tokens/total': 68812800, 'tokens/trainable': 63975056, 'epoch': '2.004'}
	98%\|█████████▊\| 525/536 [51:24<00:35, 3.21s/it] 98%\|█████████▊\| 526/536 [51:26<00:29, 2.92s/it] 98%\|█████████▊\| 527/536 [51:28<00:24, 2.75s/it] 99%\|█████████▊\| 528/536 [51:31<00:21, 2.71s/it] 99%\|█████████▊\| 529/536 [51:33<00:18, 2.60s/it] 99%\|█████████▉\| 530/536 [51:36<00:15, 2.52s/it] {'loss': '0.4546', 'grad_norm': '0.2109', 'learning_rate': '5.182e-09', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6575', 'tokens/total': 69468160, 'tokens/trainable': 64586448, 'epoch': '2.023'}
	99%\|█████████▉\| 530/536 [51:36<00:15, 2.52s/it] 99%\|█████████▉\| 531/536 [51:38<00:12, 2.45s/it] 99%\|█████████▉\| 532/536 [51:40<00:09, 2.40s/it] 99%\|█████████▉\| 533/536 [51:43<00:07, 2.40s/it] 100%\|█████████▉\| 534/536 [51:45<00:04, 2.39s/it] 100%\|█████████▉\| 535/536 [51:47<00:02, 2.42s/it] {'loss': '0.4493', 'grad_norm': '0.2314', 'learning_rate': '4.231e-10', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6115', 'tokens/total': 70123520, 'tokens/trainable': 65199364, 'epoch': '2.042'}
	100%\|█████████▉\| 535/536 [51:47<00:02, 2.42s/it] 100%\|██████████\| 536/536 [51:50<00:00, 2.37s/it][2026-03-16 20:03:25,941] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-536

	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s][A
	Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.68s/it][A Writing model shards: 100%\|██████████\| 1/1 [00:16<00:00, 16.68s/it]
	{'train_runtime': '3210', 'train_samples_per_second': '2.672', 'train_steps_per_second': '0.167', 'train_loss': '0.4897', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'epoch': '2.046', 'tokens/train_per_sec_per_gpu': '6757'}
	100%\|██████████\| 536/536 [53:26<00:00, 2.37s/it] 100%\|██████████\| 536/536 [53:26<00:00, 5.98s/it]
	[2026-03-16 20:04:52,263] [INFO] [axolotl.train.save_trained_model:237] [PID:213] Training completed! Saving trained model to ./outputs/qwen3-sft-stmt-tk/.
	[2026-03-16 20:05:01,009] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/
	Writing model shards: 0%\| \| 0/1 [00:00<?, ?it/s] Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.53s/it] Writing model shards: 100%\|██████████\| 1/1 [00:17<00:00, 17.53s/it]
	[2026-03-16 20:05:19,091] [INFO] [axolotl.train.save_trained_model:351] [PID:213] Model successfully saved to ./outputs/qwen3-sft-stmt-tk/