Lean4-sft-tk-8b / debug.log
xiaolesu's picture
Upload folder using huggingface_hub
5270841 verified
[2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB ()
[2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config:
{
"activation_offloading": false,
"axolotl_config_path": "qwen3-sft-stmt-tk.yml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 16,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": true,
"n_gpu": 8,
"n_node": 1
},
"chat_template": "qwen3",
"chat_template_kwargs": {
"enable_thinking": false
},
"context_parallel_size": 1,
"dataloader_num_workers": 8,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 192,
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "xiaolesu/lean4-sft-stmt-tk",
"split": "train",
"trust_remote_code": false,
"type": "alpaca"
}
],
"ddp": true,
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 2,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"evals_per_epoch": 10,
"experimental_skip_move_to_device": true,
"flex_attention": true,
"flex_attn_compile_kwargs": {
"dynamic": false,
"mode": "max-autotune-no-cudagraphs"
},
"fp16": false,
"fsdp": [
"full_shard",
"auto_wrap"
],
"fsdp_config": {
"activation_checkpointing": true,
"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"cpu_ram_efficient_loading": true,
"fsdp_version": 2,
"offload_params": false,
"reshard_after_forward": true,
"state_dict_type": "FULL_STATE_DICT",
"transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
},
"fsdp_version": 2,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": false,
"include_tkps": true,
"learning_rate": 1e-05,
"liger_fused_linear_cross_entropy": true,
"liger_glu_activation": true,
"liger_layer_norm": true,
"liger_rms_norm": true,
"liger_rope": true,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 5,
"lora_dropout": 0.0,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"micro_batch_size": 2,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_fused",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/qwen3-sft-stmt-tk/",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.liger.LigerPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.05,
"save_total_limit": 3,
"saves_per_epoch": 10,
"sequence_len": 8192,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"wandb_name": "qwen3-8b-tk-run1",
"wandb_project": "qwen3-sft-stmt-tk",
"warmup_ratio": 0.1,
"weight_decay": 0.0,
"world_size": 8
}
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets...
[2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s]
[2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 0%| | 0/11192 [00:00<?, ? examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 1%| | 59/11192 [00:02<06:34, 28.25 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 3%|β–Ž | 295/11192 [00:02<01:02, 175.65 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 6%|β–Œ | 649/11192 [00:02<00:23, 453.06 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 8%|β–Š | 885/11192 [00:02<00:16, 634.46 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 10%|β–ˆ | 1121/11192 [00:02<00:11, 849.04 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 13%|β–ˆβ–Ž | 1416/11192 [00:02<00:08, 1166.00 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 15%|β–ˆβ–Œ | 1711/11192 [00:02<00:06, 1480.17 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 18%|β–ˆβ–Š | 2006/11192 [00:02<00:05, 1697.58 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 21%|β–ˆβ–ˆ | 2301/11192 [00:02<00:04, 1949.74 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 23%|β–ˆβ–ˆβ–Ž | 2596/11192 [00:03<00:04, 2145.10 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 26%|β–ˆβ–ˆβ–Œ | 2891/11192 [00:03<00:03, 2324.57 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 29%|β–ˆβ–ˆβ–‰ | 3245/11192 [00:03<00:03, 2566.75 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 7828/11192 [00:03<00:00, 14035.00 examples/s] Dropping Invalid Sequences (<None or >8192) (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 11192/11192 [00:04<00:00, 2753.84 examples/s]
[2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192])
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/10830 [00:00<?, ? examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 1%| | 57/10830 [00:02<06:27, 27.78 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 3%|β–Ž | 285/10830 [00:02<01:00, 173.64 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 4%|▍ | 456/10830 [00:02<00:34, 299.77 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 6%|β–‹ | 684/10830 [00:02<00:20, 506.62 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 8%|β–Š | 912/10830 [00:02<00:13, 736.95 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 11%|β–ˆ | 1140/10830 [00:02<00:10, 947.17 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 13%|β–ˆβ–Ž | 1368/10830 [00:02<00:08, 1094.03 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 15%|β–ˆβ– | 1596/10830 [00:02<00:07, 1269.49 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 17%|β–ˆβ–‹ | 1824/10830 [00:02<00:06, 1437.65 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 19%|β–ˆβ–‰ | 2052/10830 [00:03<00:05, 1614.63 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 21%|β–ˆβ–ˆ | 2280/10830 [00:03<00:05, 1635.72 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 23%|β–ˆβ–ˆβ–Ž | 2508/10830 [00:03<00:04, 1732.21 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 25%|β–ˆβ–ˆβ–Œ | 2736/10830 [00:03<00:04, 1721.60 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 27%|β–ˆβ–ˆβ–‹ | 2964/10830 [00:03<00:04, 1703.27 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 29%|β–ˆβ–ˆβ–‰ | 3192/10830 [00:03<00:04, 1798.77 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 32%|β–ˆβ–ˆβ–ˆβ– | 3477/10830 [00:03<00:03, 1958.86 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 34%|β–ˆβ–ˆβ–ˆβ– | 3705/10830 [00:03<00:03, 2037.08 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 36%|β–ˆβ–ˆβ–ˆβ–‹ | 3933/10830 [00:04<00:03, 2067.96 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 38%|β–ˆβ–ˆβ–ˆβ–Š | 4161/10830 [00:04<00:03, 2091.19 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 4389/10830 [00:04<00:05, 1127.36 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4670/10830 [00:04<00:04, 1385.39 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4894/10830 [00:04<00:04, 1432.10 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 7526/10830 [00:04<00:00, 6499.14 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:05<00:00, 1931.57 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 0%| | 0/10830 [00:00<?, ? examples/s] Add position_id column (Sample Packing) (num_proc=192): 1%| | 57/10830 [00:02<06:33, 27.40 examples/s] Add position_id column (Sample Packing) (num_proc=192): 2%|▏ | 228/10830 [00:02<01:18, 135.14 examples/s] Add position_id column (Sample Packing) (num_proc=192): 4%|▍ | 456/10830 [00:02<00:33, 310.31 examples/s] Add position_id column (Sample Packing) (num_proc=192): 8%|β–Š | 912/10830 [00:02<00:14, 692.10 examples/s] Add position_id column (Sample Packing) (num_proc=192): 11%|β–ˆ | 1140/10830 [00:02<00:11, 858.26 examples/s] Add position_id column (Sample Packing) (num_proc=192): 13%|β–ˆβ–Ž | 1368/10830 [00:02<00:09, 1027.56 examples/s] Add position_id column (Sample Packing) (num_proc=192): 15%|β–ˆβ– | 1596/10830 [00:02<00:07, 1182.55 examples/s] Add position_id column (Sample Packing) (num_proc=192): 17%|β–ˆβ–‹ | 1881/10830 [00:02<00:06, 1425.26 examples/s] Add position_id column (Sample Packing) (num_proc=192): 20%|β–ˆβ–ˆ | 2166/10830 [00:03<00:05, 1604.97 examples/s] Add position_id column (Sample Packing) (num_proc=192): 22%|β–ˆβ–ˆβ– | 2394/10830 [00:03<00:04, 1738.29 examples/s] Add position_id column (Sample Packing) (num_proc=192): 25%|β–ˆβ–ˆβ– | 2679/10830 [00:03<00:04, 1951.23 examples/s] Add position_id column (Sample Packing) (num_proc=192): 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 6854/10830 [00:03<00:00, 11681.66 examples/s] Add position_id column (Sample Packing) (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:04<00:00, 2621.72 examples/s]
Saving the dataset (0/42 shards): 0%| | 0/10830 [00:00<?, ? examples/s] Saving the dataset (0/42 shards): 2%|▏ | 258/10830 [00:00<00:22, 464.02 examples/s] Saving the dataset (1/42 shards): 2%|▏ | 258/10830 [00:00<00:22, 464.02 examples/s] Saving the dataset (2/42 shards): 7%|β–‹ | 774/10830 [00:00<00:21, 464.02 examples/s] Saving the dataset (3/42 shards): 7%|β–‹ | 774/10830 [00:00<00:21, 464.02 examples/s] Saving the dataset (4/42 shards): 14%|β–ˆβ– | 1548/10830 [00:00<00:20, 464.02 examples/s] Saving the dataset (5/42 shards): 14%|β–ˆβ– | 1548/10830 [00:00<00:20, 464.02 examples/s] Saving the dataset (6/42 shards): 17%|β–ˆβ–‹ | 1806/10830 [00:00<00:19, 464.02 examples/s] Saving the dataset (7/42 shards): 19%|β–ˆβ–‰ | 2064/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (8/42 shards): 21%|β–ˆβ–ˆβ– | 2322/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (9/42 shards): 21%|β–ˆβ–ˆβ– | 2322/10830 [00:00<00:18, 464.02 examples/s] Saving the dataset (10/42 shards): 26%|β–ˆβ–ˆβ–Œ | 2838/10830 [00:00<00:17, 464.02 examples/s] Saving the dataset (11/42 shards): 29%|β–ˆβ–ˆβ–Š | 3096/10830 [00:00<00:16, 464.02 examples/s] Saving the dataset (12/42 shards): 31%|β–ˆβ–ˆβ–ˆ | 3354/10830 [00:00<00:16, 464.02 examples/s] Saving the dataset (13/42 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 3612/10830 [00:00<00:15, 464.02 examples/s] Saving the dataset (14/42 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 3612/10830 [00:00<00:15, 464.02 examples/s] Saving the dataset (15/42 shards): 38%|β–ˆβ–ˆβ–ˆβ–Š | 4128/10830 [00:00<00:14, 464.02 examples/s] Saving the dataset (16/42 shards): 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4386/10830 [00:00<00:13, 464.02 examples/s] Saving the dataset (17/42 shards): 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4386/10830 [00:00<00:13, 464.02 examples/s] Saving the dataset (18/42 shards): 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4902/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (19/42 shards): 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5160/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (20/42 shards): 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5160/10830 [00:00<00:12, 464.02 examples/s] Saving the dataset (21/42 shards): 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5676/10830 [00:00<00:11, 464.02 examples/s] Saving the dataset (22/42 shards): 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5676/10830 [00:00<00:11, 464.02 examples/s] Saving the dataset (23/42 shards): 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5934/10830 [00:00<00:10, 464.02 examples/s] Saving the dataset (24/42 shards): 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 6192/10830 [00:00<00:09, 464.02 examples/s] Saving the dataset (25/42 shards): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (26/42 shards): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (27/42 shards): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 6966/10830 [00:00<00:08, 464.02 examples/s] Saving the dataset (28/42 shards): 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 7224/10830 [00:00<00:07, 464.02 examples/s] Saving the dataset (29/42 shards): 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (30/42 shards): 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (31/42 shards): 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7998/10830 [00:00<00:06, 464.02 examples/s] Saving the dataset (32/42 shards): 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 8514/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (33/42 shards): 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8772/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (34/42 shards): 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8772/10830 [00:00<00:04, 464.02 examples/s] Saving the dataset (35/42 shards): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 9030/10830 [00:00<00:03, 464.02 examples/s] Saving the dataset (36/42 shards): 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 9545/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (37/42 shards): 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 9545/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (38/42 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9802/10830 [00:00<00:02, 464.02 examples/s] Saving the dataset (39/42 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 10316/10830 [00:00<00:01, 464.02 examples/s] Saving the dataset (40/42 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 10316/10830 [00:00<00:01, 464.02 examples/s] Saving the dataset (41/42 shards): 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 10573/10830 [00:00<00:00, 464.02 examples/s] Saving the dataset (42/42 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:00<00:00, 464.02 examples/s] Saving the dataset (42/42 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:00<00:00, 16314.56 examples/s]
[2026-03-16 19:08:54,045] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:213] total_num_tokens: 33_957_071
[2026-03-16 19:08:54,340] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:213] `total_supervised_tokens: 32_028_150`
[2026-03-16 19:08:55,893] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.7050187587738037
[2026-03-16 19:11:05,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2148, 2146, 2148, 2145, 2146, 2146, 2148, 2145]
[2026-03-16 19:11:06,172] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:213] data_loader_len: 268
[2026-03-16 19:11:06,189] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:213] sample_packing_eff_est across ranks: [0.9646614789962769, 0.9657852649688721, 0.9646614789962769, 0.9657852649688721, 0.9648860096931458, 0.9648860096931458, 0.9653354287147522, 0.9657852649688721]
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:213] sample_packing_eff_est: 0.97
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:213] total_num_steps: 536
[2026-03-16 19:11:06,192] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:213] Maximum number of steps set at 536
[2026-03-16 19:11:06,242] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:213] loading tokenizer... Qwen/Qwen3-8B
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model
[2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing...
[2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading weights: 0%| | 0/399 [00:00<?, ?it/s] Loading weights: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 399/399 [00:00<00:00, 9671.84it/s]
[2026-03-16 19:11:09,882] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:359] [PID:213] Converting modules to torch.bfloat16
[2026-03-16 19:11:09,885] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:213] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-16 19:11:11,696] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:213] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.Multiple deprecation warnings due to FSDP2 conversion:
sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-03-16 19:11:12,192] [INFO] [axolotl.train.save_initial_configs:417] [PID:213] Pre-saving tokenizer to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,283] [INFO] [axolotl.train.save_initial_configs:422] [PID:213] Pre-saving model config to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,286] [INFO] [axolotl.train.execute_training:218] [PID:213] Starting trainer...
[2026-03-16 19:11:14,793] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.9547648429870605
[2026-03-16 19:11:14,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2103, 2104, 2104, 2104, 2103, 2104, 2106, 2104]
[2026-03-16 19:11:15,013] [INFO] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:34] [PID:213] Broadcasting full state dict to all ranks...
[2026-03-16 19:11:22,269] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:86] [PID:213] Time taken to load full state dict: 7.26 seconds
[2026-03-16 19:11:22,270] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.log_gpu_memory_usage:127] [PID:213] Memory usage after broadcasting full state dict 3.067GB (+3.067GB allocated, +3.178GB reserved)
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
wandb: Currently logged in as: suxiaole0223 (suxiaole) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run kje10pck
wandb: Tracking run with wandb version 0.25.1
wandb: Run data is saved locally in /workspace/axolotl-workspace/wandb/run-20260316_191122-kje10pck
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run qwen3-8b-tk-run1
wandb: ⭐️ View project at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk
wandb: πŸš€ View run at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk/runs/kje10pck
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-16 19:11:25,554] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:213] The Axolotl config has been saved to the WandB run under files.
0%| | 0/536 [00:00<?, ?it/s][2026-03-16 19:11:57,210] [WARNING] [py.warnings._showwarnmsg:110] [PID:213] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
_warn_once(
0%| | 1/536 [00:40<6:03:21, 40.75s/it] 0%| | 2/536 [00:43<2:42:00, 18.20s/it] 1%| | 3/536 [00:45<1:37:15, 10.95s/it] 1%| | 4/536 [00:47<1:07:23, 7.60s/it] 1%| | 5/536 [00:50<50:28, 5.70s/it] {'loss': '0.8667', 'grad_norm': '2.609', 'learning_rate': '7.547e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6531', 'tokens/total': 655360, 'tokens/trainable': 611049, 'epoch': '0.01908'}
1%| | 5/536 [00:50<50:28, 5.70s/it] 1%| | 6/536 [00:52<40:15, 4.56s/it] 1%|▏ | 7/536 [00:55<34:02, 3.86s/it] 1%|▏ | 8/536 [00:57<30:00, 3.41s/it] 2%|▏ | 9/536 [00:59<26:45, 3.05s/it] 2%|▏ | 10/536 [01:02<24:45, 2.82s/it] {'loss': '0.8307', 'grad_norm': '2.5', 'learning_rate': '1.698e-06', 'ppl': '2.295', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6647', 'tokens/total': 1310720, 'tokens/trainable': 1224548, 'epoch': '0.03817'}
2%|▏ | 10/536 [01:02<24:45, 2.82s/it] 2%|▏ | 11/536 [01:04<23:13, 2.65s/it] 2%|▏ | 12/536 [01:06<22:04, 2.53s/it] 2%|▏ | 13/536 [01:08<21:32, 2.47s/it] 3%|β–Ž | 14/536 [01:11<21:27, 2.47s/it] 3%|β–Ž | 15/536 [01:13<21:28, 2.47s/it] {'loss': '0.8487', 'grad_norm': '2.453', 'learning_rate': '2.642e-06', 'ppl': '2.337', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6160', 'tokens/total': 1966080, 'tokens/trainable': 1834432, 'epoch': '0.05725'}
3%|β–Ž | 15/536 [01:13<21:28, 2.47s/it] 3%|β–Ž | 16/536 [01:16<21:18, 2.46s/it] 3%|β–Ž | 17/536 [01:18<20:51, 2.41s/it] 3%|β–Ž | 18/536 [01:20<20:44, 2.40s/it] 4%|β–Ž | 19/536 [01:23<21:59, 2.55s/it] 4%|β–Ž | 20/536 [01:26<21:40, 2.52s/it] {'loss': '0.7713', 'grad_norm': '1.898', 'learning_rate': '3.585e-06', 'ppl': '2.163', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6256', 'tokens/total': 2621440, 'tokens/trainable': 2448388, 'epoch': '0.07634'}
4%|β–Ž | 20/536 [01:26<21:40, 2.52s/it] 4%|▍ | 21/536 [01:28<21:23, 2.49s/it] 4%|▍ | 22/536 [01:31<20:49, 2.43s/it] 4%|▍ | 23/536 [01:33<20:37, 2.41s/it] 4%|▍ | 24/536 [01:35<20:37, 2.42s/it] 5%|▍ | 25/536 [01:38<20:01, 2.35s/it] {'loss': '0.7452', 'grad_norm': '1.273', 'learning_rate': '4.528e-06', 'ppl': '2.107', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6954', 'tokens/total': 3276800, 'tokens/trainable': 3060985, 'epoch': '0.09542'}
5%|▍ | 25/536 [01:38<20:01, 2.35s/it] 5%|▍ | 26/536 [01:40<19:41, 2.32s/it] 5%|β–Œ | 27/536 [01:42<19:26, 2.29s/it][2026-03-16 19:13:17,483] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-27
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.48s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.48s/it]
5%|β–Œ | 28/536 [03:16<4:11:47, 29.74s/it] 5%|β–Œ | 29/536 [03:18<3:01:39, 21.50s/it] 6%|β–Œ | 30/536 [03:20<2:12:38, 15.73s/it] {'loss': '0.718', 'grad_norm': '0.7695', 'learning_rate': '5.472e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6706', 'tokens/total': 3932160, 'tokens/trainable': 3670695, 'epoch': '0.1145'}
6%|β–Œ | 30/536 [03:20<2:12:38, 15.73s/it] 6%|β–Œ | 31/536 [03:23<1:38:27, 11.70s/it] 6%|β–Œ | 32/536 [03:25<1:14:38, 8.89s/it] 6%|β–Œ | 33/536 [03:27<57:48, 6.90s/it] 6%|β–‹ | 34/536 [03:29<46:07, 5.51s/it] 7%|β–‹ | 35/536 [03:32<37:56, 4.54s/it] {'loss': '0.6699', 'grad_norm': '0.6406', 'learning_rate': '6.415e-06', 'ppl': '1.954', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6770', 'tokens/total': 4587520, 'tokens/trainable': 4284736, 'epoch': '0.1336'}
7%|β–‹ | 35/536 [03:32<37:56, 4.54s/it] 7%|β–‹ | 36/536 [03:34<32:19, 3.88s/it] 7%|β–‹ | 37/536 [03:37<28:45, 3.46s/it] 7%|β–‹ | 38/536 [03:39<26:05, 3.14s/it] 7%|β–‹ | 39/536 [03:41<24:10, 2.92s/it] 7%|β–‹ | 40/536 [03:44<22:31, 2.72s/it] {'loss': '0.6393', 'grad_norm': '0.418', 'learning_rate': '7.358e-06', 'ppl': '1.895', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6668', 'tokens/total': 5242880, 'tokens/trainable': 4896504, 'epoch': '0.1527'}
7%|β–‹ | 40/536 [03:44<22:31, 2.72s/it] 8%|β–Š | 41/536 [03:46<21:24, 2.59s/it] 8%|β–Š | 42/536 [03:48<20:36, 2.50s/it] 8%|β–Š | 43/536 [03:51<20:06, 2.45s/it] 8%|β–Š | 44/536 [03:53<19:38, 2.39s/it] 8%|β–Š | 45/536 [03:55<19:17, 2.36s/it] {'loss': '0.5953', 'grad_norm': '0.3594', 'learning_rate': '8.302e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 5898240, 'tokens/trainable': 5505933, 'epoch': '0.1718'}
8%|β–Š | 45/536 [03:55<19:17, 2.36s/it] 9%|β–Š | 46/536 [03:57<19:17, 2.36s/it] 9%|β–‰ | 47/536 [04:00<19:01, 2.33s/it] 9%|β–‰ | 48/536 [04:02<19:02, 2.34s/it] 9%|β–‰ | 49/536 [04:04<19:02, 2.35s/it] 9%|β–‰ | 50/536 [04:07<18:55, 2.34s/it] {'loss': '0.5779', 'grad_norm': '0.332', 'learning_rate': '9.245e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6574', 'tokens/total': 6553600, 'tokens/trainable': 6116643, 'epoch': '0.1908'}
9%|β–‰ | 50/536 [04:07<18:55, 2.34s/it] 10%|β–‰ | 51/536 [04:09<18:46, 2.32s/it] 10%|β–‰ | 52/536 [04:11<18:33, 2.30s/it] 10%|β–‰ | 53/536 [04:14<18:19, 2.28s/it] 10%|β–ˆ | 54/536 [04:16<18:23, 2.29s/it][2026-03-16 19:15:50,860] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-54
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.65s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.65s/it]
10%|β–ˆ | 55/536 [05:48<3:55:25, 29.37s/it] {'loss': '0.5579', 'grad_norm': '0.2793', 'learning_rate': '1e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4847', 'tokens/total': 7208960, 'tokens/trainable': 6728061, 'epoch': '0.2099'}
10%|β–ˆ | 55/536 [05:48<3:55:25, 29.37s/it] 10%|β–ˆ | 56/536 [05:51<2:50:56, 21.37s/it] 11%|β–ˆ | 57/536 [05:54<2:05:27, 15.72s/it] 11%|β–ˆ | 58/536 [05:56<1:33:09, 11.69s/it] 11%|β–ˆ | 59/536 [05:58<1:11:13, 8.96s/it] 11%|β–ˆ | 60/536 [06:01<55:01, 6.94s/it] {'loss': '0.5485', 'grad_norm': '0.2773', 'learning_rate': '9.996e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 7864320, 'tokens/trainable': 7336524, 'epoch': '0.229'}
11%|β–ˆ | 60/536 [06:01<55:01, 6.94s/it] 11%|β–ˆβ– | 61/536 [06:03<43:45, 5.53s/it] 12%|β–ˆβ– | 62/536 [06:05<36:05, 4.57s/it] 12%|β–ˆβ– | 63/536 [06:08<30:31, 3.87s/it] 12%|β–ˆβ– | 64/536 [06:10<26:38, 3.39s/it] 12%|β–ˆβ– | 65/536 [06:12<24:01, 3.06s/it] {'loss': '0.5385', 'grad_norm': '0.2734', 'learning_rate': '9.987e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6565', 'tokens/total': 8519680, 'tokens/trainable': 7944984, 'epoch': '0.2481'}
12%|β–ˆβ– | 65/536 [06:12<24:01, 3.06s/it] 12%|β–ˆβ– | 66/536 [06:14<22:09, 2.83s/it] 12%|β–ˆβ–Ž | 67/536 [06:17<21:06, 2.70s/it] 13%|β–ˆβ–Ž | 68/536 [06:19<20:07, 2.58s/it] 13%|β–ˆβ–Ž | 69/536 [06:21<19:21, 2.49s/it] 13%|β–ˆβ–Ž | 70/536 [06:24<19:00, 2.45s/it] {'loss': '0.5197', 'grad_norm': '0.2578', 'learning_rate': '9.973e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 9175040, 'tokens/trainable': 8556200, 'epoch': '0.2672'}
13%|β–ˆβ–Ž | 70/536 [06:24<19:00, 2.45s/it] 13%|β–ˆβ–Ž | 71/536 [06:26<18:33, 2.39s/it] 13%|β–ˆβ–Ž | 72/536 [06:28<18:14, 2.36s/it] 14%|β–ˆβ–Ž | 73/536 [06:31<18:01, 2.34s/it] 14%|β–ˆβ– | 74/536 [06:33<18:02, 2.34s/it] 14%|β–ˆβ– | 75/536 [06:35<17:54, 2.33s/it] {'loss': '0.5316', 'grad_norm': '0.3008', 'learning_rate': '9.953e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6728', 'tokens/total': 9830400, 'tokens/trainable': 9167282, 'epoch': '0.2863'}
14%|β–ˆβ– | 75/536 [06:35<17:54, 2.33s/it] 14%|β–ˆβ– | 76/536 [06:38<17:54, 2.34s/it] 14%|β–ˆβ– | 77/536 [06:40<18:05, 2.37s/it] 15%|β–ˆβ– | 78/536 [06:43<18:28, 2.42s/it] 15%|β–ˆβ– | 79/536 [06:45<18:05, 2.37s/it] 15%|β–ˆβ– | 80/536 [06:47<17:47, 2.34s/it] {'loss': '0.5154', 'grad_norm': '0.3164', 'learning_rate': '9.929e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6730', 'tokens/total': 10485760, 'tokens/trainable': 9774908, 'epoch': '0.3053'}
15%|β–ˆβ– | 80/536 [06:47<17:47, 2.34s/it] 15%|β–ˆβ–Œ | 81/536 [06:49<17:39, 2.33s/it][2026-03-16 19:18:24,375] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-81
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.37s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.37s/it]
15%|β–ˆβ–Œ | 82/536 [08:22<3:43:29, 29.54s/it] 15%|β–ˆβ–Œ | 83/536 [08:25<2:41:12, 21.35s/it] 16%|β–ˆβ–Œ | 84/536 [08:27<1:57:43, 15.63s/it] 16%|β–ˆβ–Œ | 85/536 [08:29<1:27:29, 11.64s/it] {'loss': '0.5143', 'grad_norm': '0.2363', 'learning_rate': '9.899e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6604', 'tokens/total': 11141120, 'tokens/trainable': 10388109, 'epoch': '0.3244'}
16%|β–ˆβ–Œ | 85/536 [08:29<1:27:29, 11.64s/it] 16%|β–ˆβ–Œ | 86/536 [08:32<1:06:16, 8.84s/it] 16%|β–ˆβ–Œ | 87/536 [08:34<51:19, 6.86s/it] 16%|β–ˆβ–‹ | 88/536 [08:36<40:55, 5.48s/it] 17%|β–ˆβ–‹ | 89/536 [08:38<33:37, 4.51s/it] 17%|β–ˆβ–‹ | 90/536 [08:41<28:43, 3.86s/it] {'loss': '0.4957', 'grad_norm': '0.2412', 'learning_rate': '9.864e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 11796480, 'tokens/trainable': 10999678, 'epoch': '0.3435'}
17%|β–ˆβ–‹ | 90/536 [08:41<28:43, 3.86s/it] 17%|β–ˆβ–‹ | 91/536 [08:43<25:17, 3.41s/it] 17%|β–ˆβ–‹ | 92/536 [08:45<22:49, 3.08s/it] 17%|β–ˆβ–‹ | 93/536 [08:48<21:12, 2.87s/it] 18%|β–ˆβ–Š | 94/536 [08:50<19:44, 2.68s/it] 18%|β–ˆβ–Š | 95/536 [08:52<19:22, 2.64s/it] {'loss': '0.509', 'grad_norm': '0.2236', 'learning_rate': '9.823e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5974', 'tokens/total': 12451840, 'tokens/trainable': 11609345, 'epoch': '0.3626'}
18%|β–ˆβ–Š | 95/536 [08:52<19:22, 2.64s/it] 18%|β–ˆβ–Š | 96/536 [08:55<18:35, 2.54s/it] 18%|β–ˆβ–Š | 97/536 [08:57<18:01, 2.46s/it] 18%|β–ˆβ–Š | 98/536 [09:00<19:09, 2.62s/it] 18%|β–ˆβ–Š | 99/536 [09:03<19:00, 2.61s/it] 19%|β–ˆβ–Š | 100/536 [09:05<18:16, 2.51s/it] {'loss': '0.4925', 'grad_norm': '0.2451', 'learning_rate': '9.778e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6696', 'tokens/total': 13107200, 'tokens/trainable': 12218448, 'epoch': '0.3817'}
19%|β–ˆβ–Š | 100/536 [09:05<18:16, 2.51s/it] 19%|β–ˆβ–‰ | 101/536 [09:07<17:50, 2.46s/it] 19%|β–ˆβ–‰ | 102/536 [09:09<17:19, 2.40s/it] 19%|β–ˆβ–‰ | 103/536 [09:12<16:59, 2.35s/it] 19%|β–ˆβ–‰ | 104/536 [09:14<16:47, 2.33s/it] 20%|β–ˆβ–‰ | 105/536 [09:16<16:34, 2.31s/it] {'loss': '0.5051', 'grad_norm': '0.25', 'learning_rate': '9.727e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6724', 'tokens/total': 13762560, 'tokens/trainable': 12826468, 'epoch': '0.4008'}
20%|β–ˆβ–‰ | 105/536 [09:16<16:34, 2.31s/it] 20%|β–ˆβ–‰ | 106/536 [09:19<16:28, 2.30s/it] 20%|β–ˆβ–‰ | 107/536 [09:21<16:26, 2.30s/it] 20%|β–ˆβ–ˆ | 108/536 [09:23<16:27, 2.31s/it][2026-03-16 19:20:58,221] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-108
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.13s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.13s/it]
20%|β–ˆβ–ˆ | 109/536 [11:03<3:44:22, 31.53s/it] 21%|β–ˆβ–ˆ | 110/536 [11:05<2:41:43, 22.78s/it] {'loss': '0.4725', 'grad_norm': '0.2266', 'learning_rate': '9.672e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6506', 'tokens/total': 14417920, 'tokens/trainable': 13440042, 'epoch': '0.4198'}
21%|β–ˆβ–ˆ | 110/536 [11:05<2:41:43, 22.78s/it] 21%|β–ˆβ–ˆ | 111/536 [11:07<1:57:41, 16.61s/it] 21%|β–ˆβ–ˆ | 112/536 [11:10<1:26:54, 12.30s/it] 21%|β–ˆβ–ˆ | 113/536 [11:12<1:05:42, 9.32s/it] 21%|β–ˆβ–ˆβ– | 114/536 [11:14<50:52, 7.23s/it] 21%|β–ˆβ–ˆβ– | 115/536 [11:17<40:26, 5.76s/it] {'loss': '0.5004', 'grad_norm': '0.2256', 'learning_rate': '9.612e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6525', 'tokens/total': 15073280, 'tokens/trainable': 14049913, 'epoch': '0.4389'}
21%|β–ˆβ–ˆβ– | 115/536 [11:17<40:26, 5.76s/it] 22%|β–ˆβ–ˆβ– | 116/536 [11:19<33:04, 4.72s/it] 22%|β–ˆβ–ˆβ– | 117/536 [11:22<28:43, 4.11s/it] 22%|β–ˆβ–ˆβ– | 118/536 [11:24<24:56, 3.58s/it] 22%|β–ˆβ–ˆβ– | 119/536 [11:26<22:09, 3.19s/it] 22%|β–ˆβ–ˆβ– | 120/536 [11:29<20:22, 2.94s/it] {'loss': '0.4727', 'grad_norm': '0.248', 'learning_rate': '9.546e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6422', 'tokens/total': 15728640, 'tokens/trainable': 14657396, 'epoch': '0.458'}
22%|β–ˆβ–ˆβ– | 120/536 [11:29<20:22, 2.94s/it] 23%|β–ˆβ–ˆβ–Ž | 121/536 [11:31<19:02, 2.75s/it] 23%|β–ˆβ–ˆβ–Ž | 122/536 [11:33<18:01, 2.61s/it] 23%|β–ˆβ–ˆβ–Ž | 123/536 [11:36<17:25, 2.53s/it] 23%|β–ˆβ–ˆβ–Ž | 124/536 [11:38<17:02, 2.48s/it] 23%|β–ˆβ–ˆβ–Ž | 125/536 [11:40<16:31, 2.41s/it] {'loss': '0.4808', 'grad_norm': '0.2344', 'learning_rate': '9.476e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6830', 'tokens/total': 16384000, 'tokens/trainable': 15266794, 'epoch': '0.4771'}
23%|β–ˆβ–ˆβ–Ž | 125/536 [11:40<16:31, 2.41s/it] 24%|β–ˆβ–ˆβ–Ž | 126/536 [11:43<16:21, 2.39s/it] 24%|β–ˆβ–ˆβ–Ž | 127/536 [11:45<16:21, 2.40s/it] 24%|β–ˆβ–ˆβ– | 128/536 [11:47<16:06, 2.37s/it] 24%|β–ˆβ–ˆβ– | 129/536 [11:50<15:59, 2.36s/it] 24%|β–ˆβ–ˆβ– | 130/536 [11:52<15:57, 2.36s/it] {'loss': '0.4726', 'grad_norm': '0.2451', 'learning_rate': '9.401e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 17039360, 'tokens/trainable': 15876387, 'epoch': '0.4962'}
24%|β–ˆβ–ˆβ– | 130/536 [11:52<15:57, 2.36s/it] 24%|β–ˆβ–ˆβ– | 131/536 [11:54<15:48, 2.34s/it] 25%|β–ˆβ–ˆβ– | 132/536 [11:57<15:37, 2.32s/it] 25%|β–ˆβ–ˆβ– | 133/536 [11:59<15:31, 2.31s/it] 25%|β–ˆβ–ˆβ–Œ | 134/536 [12:01<15:54, 2.37s/it] 25%|β–ˆβ–ˆβ–Œ | 135/536 [12:04<16:09, 2.42s/it] {'loss': '0.4864', 'grad_norm': '0.2344', 'learning_rate': '9.322e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6056', 'tokens/total': 17694720, 'tokens/trainable': 16486440, 'epoch': '0.5153'}
25%|β–ˆβ–ˆβ–Œ | 135/536 [12:04<16:09, 2.42s/it][2026-03-16 19:23:38,988] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-135
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.41s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.41s/it]
25%|β–ˆβ–ˆβ–Œ | 136/536 [13:41<3:26:11, 30.93s/it] 26%|β–ˆβ–ˆβ–Œ | 137/536 [13:45<2:30:52, 22.69s/it] 26%|β–ˆβ–ˆβ–Œ | 138/536 [13:47<1:49:51, 16.56s/it] 26%|β–ˆβ–ˆβ–Œ | 139/536 [13:49<1:21:09, 12.27s/it] 26%|β–ˆβ–ˆβ–Œ | 140/536 [13:52<1:01:08, 9.26s/it] {'loss': '0.4817', 'grad_norm': '0.2275', 'learning_rate': '9.238e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6712', 'tokens/total': 18350080, 'tokens/trainable': 17095060, 'epoch': '0.5344'}
26%|β–ˆβ–ˆβ–Œ | 140/536 [13:52<1:01:08, 9.26s/it] 26%|β–ˆβ–ˆβ–‹ | 141/536 [13:54<47:09, 7.16s/it] 26%|β–ˆβ–ˆβ–‹ | 142/536 [13:56<37:27, 5.70s/it] 27%|β–ˆβ–ˆβ–‹ | 143/536 [13:58<30:36, 4.67s/it] 27%|β–ˆβ–ˆβ–‹ | 144/536 [14:01<25:57, 3.97s/it] 27%|β–ˆβ–ˆβ–‹ | 145/536 [14:03<22:36, 3.47s/it] {'loss': '0.4827', 'grad_norm': '0.249', 'learning_rate': '9.149e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6633', 'tokens/total': 19005440, 'tokens/trainable': 17703368, 'epoch': '0.5534'}
27%|β–ˆβ–ˆβ–‹ | 145/536 [14:03<22:36, 3.47s/it] 27%|β–ˆβ–ˆβ–‹ | 146/536 [14:05<20:19, 3.13s/it] 27%|β–ˆβ–ˆβ–‹ | 147/536 [14:08<18:42, 2.89s/it] 28%|β–ˆβ–ˆβ–Š | 148/536 [14:10<17:44, 2.74s/it] 28%|β–ˆβ–ˆβ–Š | 149/536 [14:12<16:47, 2.60s/it] 28%|β–ˆβ–ˆβ–Š | 150/536 [14:15<16:15, 2.53s/it] {'loss': '0.4892', 'grad_norm': '0.2217', 'learning_rate': '9.057e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6466', 'tokens/total': 19660800, 'tokens/trainable': 18311084, 'epoch': '0.5725'}
28%|β–ˆβ–ˆβ–Š | 150/536 [14:15<16:15, 2.53s/it] 28%|β–ˆβ–ˆβ–Š | 151/536 [14:17<15:49, 2.47s/it] 28%|β–ˆβ–ˆβ–Š | 152/536 [14:20<15:51, 2.48s/it] 29%|β–ˆβ–ˆβ–Š | 153/536 [14:22<16:18, 2.55s/it] 29%|β–ˆβ–ˆβ–Š | 154/536 [14:25<16:18, 2.56s/it] 29%|β–ˆβ–ˆβ–‰ | 155/536 [14:27<16:04, 2.53s/it] {'loss': '0.4618', 'grad_norm': '0.2236', 'learning_rate': '8.959e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6104', 'tokens/total': 20316160, 'tokens/trainable': 18920000, 'epoch': '0.5916'}
29%|β–ˆβ–ˆβ–‰ | 155/536 [14:27<16:04, 2.53s/it] 29%|β–ˆβ–ˆβ–‰ | 156/536 [14:30<15:36, 2.47s/it] 29%|β–ˆβ–ˆβ–‰ | 157/536 [14:32<15:16, 2.42s/it] 29%|β–ˆβ–ˆβ–‰ | 158/536 [14:35<15:24, 2.45s/it] 30%|β–ˆβ–ˆβ–‰ | 159/536 [14:37<15:05, 2.40s/it] 30%|β–ˆβ–ˆβ–‰ | 160/536 [14:39<14:54, 2.38s/it] {'loss': '0.471', 'grad_norm': '0.2793', 'learning_rate': '8.858e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6516', 'tokens/total': 20971520, 'tokens/trainable': 19529720, 'epoch': '0.6107'}
30%|β–ˆβ–ˆβ–‰ | 160/536 [14:39<14:54, 2.38s/it] 30%|β–ˆβ–ˆβ–ˆ | 161/536 [14:41<14:48, 2.37s/it] 30%|β–ˆβ–ˆβ–ˆ | 162/536 [14:44<14:28, 2.32s/it][2026-03-16 19:26:18,649] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-162
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]
30%|β–ˆβ–ˆβ–ˆ | 163/536 [16:21<3:11:06, 30.74s/it] 31%|β–ˆβ–ˆβ–ˆ | 164/536 [16:23<2:17:38, 22.20s/it] 31%|β–ˆβ–ˆβ–ˆ | 165/536 [16:25<1:40:18, 16.22s/it] {'loss': '0.4703', 'grad_norm': '0.2383', 'learning_rate': '8.752e-06', 'ppl': '1.6', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6785', 'tokens/total': 21626880, 'tokens/trainable': 20137712, 'epoch': '0.6298'}
31%|β–ˆβ–ˆβ–ˆ | 165/536 [16:25<1:40:18, 16.22s/it] 31%|β–ˆβ–ˆβ–ˆ | 166/536 [16:28<1:14:13, 12.04s/it] 31%|β–ˆβ–ˆβ–ˆ | 167/536 [16:30<56:04, 9.12s/it] 31%|β–ˆβ–ˆβ–ˆβ– | 168/536 [16:32<43:22, 7.07s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 169/536 [16:34<34:28, 5.64s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 170/536 [16:37<28:13, 4.63s/it] {'loss': '0.4727', 'grad_norm': '0.2139', 'learning_rate': '8.643e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6694', 'tokens/total': 22282240, 'tokens/trainable': 20749040, 'epoch': '0.6489'}
32%|β–ˆβ–ˆβ–ˆβ– | 170/536 [16:37<28:13, 4.63s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 171/536 [16:39<24:07, 3.96s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 172/536 [16:42<21:19, 3.51s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 173/536 [16:44<19:34, 3.24s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 174/536 [16:46<17:46, 2.95s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 175/536 [16:49<16:24, 2.73s/it] {'loss': '0.4856', 'grad_norm': '0.2119', 'learning_rate': '8.53e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6812', 'tokens/total': 22937600, 'tokens/trainable': 21358216, 'epoch': '0.6679'}
33%|β–ˆβ–ˆβ–ˆβ–Ž | 175/536 [16:49<16:24, 2.73s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 176/536 [16:51<15:32, 2.59s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 177/536 [16:54<16:38, 2.78s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 178/536 [16:57<15:52, 2.66s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 179/536 [16:59<15:07, 2.54s/it] 34%|β–ˆβ–ˆβ–ˆβ–Ž | 180/536 [17:01<14:47, 2.49s/it] {'loss': '0.4551', 'grad_norm': '0.2266', 'learning_rate': '8.413e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6375', 'tokens/total': 23592960, 'tokens/trainable': 21963408, 'epoch': '0.687'}
34%|β–ˆβ–ˆβ–ˆβ–Ž | 180/536 [17:01<14:47, 2.49s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 181/536 [17:04<14:25, 2.44s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 182/536 [17:06<14:08, 2.40s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 183/536 [17:08<13:47, 2.34s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 184/536 [17:10<13:42, 2.34s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 185/536 [17:13<13:34, 2.32s/it] {'loss': '0.4654', 'grad_norm': '0.2695', 'learning_rate': '8.292e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6688', 'tokens/total': 24248320, 'tokens/trainable': 22570984, 'epoch': '0.7061'}
35%|β–ˆβ–ˆβ–ˆβ– | 185/536 [17:13<13:34, 2.32s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 186/536 [17:15<13:21, 2.29s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 187/536 [17:17<13:19, 2.29s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 188/536 [17:19<13:11, 2.28s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 189/536 [17:22<13:08, 2.27s/it][2026-03-16 19:28:56,617] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-189
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.04s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.04s/it]
35%|β–ˆβ–ˆβ–ˆβ–Œ | 190/536 [19:01<3:00:49, 31.36s/it] {'loss': '0.4727', 'grad_norm': '0.2285', 'learning_rate': '8.168e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4795', 'tokens/total': 24903680, 'tokens/trainable': 23180680, 'epoch': '0.7252'}
35%|β–ˆβ–ˆβ–ˆβ–Œ | 190/536 [19:01<3:00:49, 31.36s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 191/536 [19:03<2:10:15, 22.65s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 192/536 [19:06<1:35:23, 16.64s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 193/536 [19:08<1:11:08, 12.44s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 194/536 [19:11<53:42, 9.42s/it] 36%|β–ˆβ–ˆβ–ˆβ–‹ | 195/536 [19:13<41:27, 7.30s/it] {'loss': '0.462', 'grad_norm': '0.2158', 'learning_rate': '8.041e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6567', 'tokens/total': 25559040, 'tokens/trainable': 23790730, 'epoch': '0.7443'}
36%|β–ˆβ–ˆβ–ˆβ–‹ | 195/536 [19:13<41:27, 7.30s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 196/536 [19:16<32:54, 5.81s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 197/536 [19:18<27:50, 4.93s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 198/536 [19:21<23:17, 4.13s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 199/536 [19:23<20:04, 3.57s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 200/536 [19:25<17:43, 3.17s/it] {'loss': '0.4676', 'grad_norm': '0.2188', 'learning_rate': '7.91e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6880', 'tokens/total': 26214400, 'tokens/trainable': 24401252, 'epoch': '0.7634'}
37%|β–ˆβ–ˆβ–ˆβ–‹ | 200/536 [19:25<17:43, 3.17s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 201/536 [19:27<16:11, 2.90s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 202/536 [19:30<15:05, 2.71s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 203/536 [19:32<14:15, 2.57s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 204/536 [19:34<13:41, 2.47s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 205/536 [19:37<13:22, 2.43s/it] {'loss': '0.4504', 'grad_norm': '0.2158', 'learning_rate': '7.776e-06', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6617', 'tokens/total': 26869760, 'tokens/trainable': 25010696, 'epoch': '0.7824'}
38%|β–ˆβ–ˆβ–ˆβ–Š | 205/536 [19:37<13:22, 2.43s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 206/536 [19:39<13:08, 2.39s/it] 39%|β–ˆβ–ˆβ–ˆβ–Š | 207/536 [19:41<12:51, 2.34s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 208/536 [19:43<12:41, 2.32s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 209/536 [19:46<12:40, 2.32s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 210/536 [19:48<13:00, 2.39s/it] {'loss': '0.4614', 'grad_norm': '0.2295', 'learning_rate': '7.639e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5994', 'tokens/total': 27525120, 'tokens/trainable': 25617872, 'epoch': '0.8015'}
39%|β–ˆβ–ˆβ–ˆβ–‰ | 210/536 [19:48<13:00, 2.39s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 211/536 [19:51<13:14, 2.44s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 212/536 [19:53<12:58, 2.40s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 213/536 [19:55<12:42, 2.36s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 214/536 [19:58<12:29, 2.33s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 215/536 [20:00<12:22, 2.31s/it] {'loss': '0.477', 'grad_norm': '0.2412', 'learning_rate': '7.5e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6697', 'tokens/total': 28180480, 'tokens/trainable': 26227438, 'epoch': '0.8206'}
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 215/536 [20:00<12:22, 2.31s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 216/536 [20:02<12:22, 2.32s/it][2026-03-16 19:31:37,309] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-216
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.85s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.85s/it]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 217/536 [21:40<2:45:11, 31.07s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 218/536 [21:43<1:59:15, 22.50s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 219/536 [21:45<1:26:49, 16.43s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 220/536 [21:47<1:04:08, 12.18s/it] {'loss': '0.4535', 'grad_norm': '0.2148', 'learning_rate': '7.358e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 28835840, 'tokens/trainable': 26833456, 'epoch': '0.8397'}
41%|β–ˆβ–ˆβ–ˆβ–ˆ | 220/536 [21:47<1:04:08, 12.18s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 221/536 [21:50<48:25, 9.22s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 222/536 [21:52<37:25, 7.15s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 223/536 [21:54<29:44, 5.70s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 224/536 [21:57<24:17, 4.67s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 225/536 [21:59<20:30, 3.96s/it] {'loss': '0.4639', 'grad_norm': '0.2197', 'learning_rate': '7.213e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 29491200, 'tokens/trainable': 27444416, 'epoch': '0.8588'}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 225/536 [21:59<20:30, 3.96s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 226/536 [22:01<17:50, 3.45s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 227/536 [22:03<15:56, 3.10s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 228/536 [22:06<14:40, 2.86s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 229/536 [22:08<14:24, 2.82s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 230/536 [22:11<14:01, 2.75s/it] {'loss': '0.4578', 'grad_norm': '0.2217', 'learning_rate': '7.066e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5816', 'tokens/total': 30146560, 'tokens/trainable': 28048432, 'epoch': '0.8779'}
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 230/536 [22:11<14:01, 2.75s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 231/536 [22:13<13:21, 2.63s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 232/536 [22:16<12:45, 2.52s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 233/536 [22:18<12:29, 2.47s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 234/536 [22:20<12:15, 2.44s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 235/536 [22:23<11:59, 2.39s/it] {'loss': '0.4497', 'grad_norm': '0.2354', 'learning_rate': '6.917e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6580', 'tokens/total': 30801920, 'tokens/trainable': 28655952, 'epoch': '0.8969'}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 235/536 [22:23<11:59, 2.39s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 236/536 [22:25<11:55, 2.39s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 237/536 [22:27<11:47, 2.37s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 238/536 [22:30<11:38, 2.34s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 239/536 [22:32<11:38, 2.35s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 240/536 [22:34<11:23, 2.31s/it] {'loss': '0.4693', 'grad_norm': '0.2275', 'learning_rate': '6.766e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 31457280, 'tokens/trainable': 29262050, 'epoch': '0.916'}
45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 240/536 [22:34<11:23, 2.31s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 241/536 [22:37<11:19, 2.30s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 242/536 [22:39<11:12, 2.29s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 243/536 [22:41<11:10, 2.29s/it][2026-03-16 19:34:16,197] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-243
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.81s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 244/536 [24:21<2:33:31, 31.55s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 245/536 [24:23<1:50:23, 22.76s/it] {'loss': '0.4629', 'grad_norm': '0.2178', 'learning_rate': '6.613e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 32112640, 'tokens/trainable': 29868356, 'epoch': '0.9351'}
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 245/536 [24:23<1:50:23, 22.76s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 246/536 [24:25<1:20:19, 16.62s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 247/536 [24:28<59:24, 12.33s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 248/536 [24:30<45:07, 9.40s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 249/536 [24:33<34:46, 7.27s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 250/536 [24:35<27:30, 5.77s/it] {'loss': '0.474', 'grad_norm': '0.2539', 'learning_rate': '6.458e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6714', 'tokens/total': 32768000, 'tokens/trainable': 30473100, 'epoch': '0.9542'}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 250/536 [24:35<27:30, 5.77s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 251/536 [24:37<22:25, 4.72s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 252/536 [24:39<18:50, 3.98s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 253/536 [24:42<16:27, 3.49s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 254/536 [24:44<14:46, 3.14s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 255/536 [24:46<13:26, 2.87s/it] {'loss': '0.467', 'grad_norm': '0.2305', 'learning_rate': '6.302e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6772', 'tokens/total': 33423360, 'tokens/trainable': 31078478, 'epoch': '0.9733'}
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 255/536 [24:46<13:26, 2.87s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 256/536 [24:49<12:32, 2.69s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 257/536 [24:51<11:59, 2.58s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 258/536 [24:53<11:39, 2.52s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 259/536 [24:56<11:19, 2.45s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 260/536 [24:58<11:02, 2.40s/it] {'loss': '0.4511', 'grad_norm': '0.2148', 'learning_rate': '6.144e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6619', 'tokens/total': 34078720, 'tokens/trainable': 31682612, 'epoch': '0.9924'}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 260/536 [24:58<11:02, 2.40s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 261/536 [25:00<10:50, 2.36s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 262/536 [25:03<10:56, 2.40s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 263/536 [25:06<12:20, 2.71s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 264/536 [25:08<11:41, 2.58s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 265/536 [25:11<11:22, 2.52s/it] {'loss': '0.4682', 'grad_norm': '0.2451', 'learning_rate': '5.985e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6472', 'tokens/total': 34734080, 'tokens/trainable': 32293470, 'epoch': '1.011'}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 265/536 [25:11<11:22, 2.52s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 266/536 [25:13<11:14, 2.50s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 267/536 [25:16<11:28, 2.56s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 268/536 [25:18<11:16, 2.52s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 269/536 [25:21<10:53, 2.45s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 270/536 [25:23<10:39, 2.40s/it] {'loss': '0.461', 'grad_norm': '0.2207', 'learning_rate': '5.826e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6685', 'tokens/total': 35389440, 'tokens/trainable': 32904464, 'epoch': '1.031'}
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 270/536 [25:23<10:39, 2.40s/it][2026-03-16 19:36:59,256] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-270
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.30s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.30s/it]
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 271/536 [27:02<2:18:44, 31.41s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 272/536 [27:04<1:39:50, 22.69s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 273/536 [27:07<1:12:35, 16.56s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 274/536 [27:09<53:33, 12.26s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 275/536 [27:11<40:23, 9.29s/it] {'loss': '0.4545', 'grad_norm': '0.2324', 'learning_rate': '5.665e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6568', 'tokens/total': 36044800, 'tokens/trainable': 33517832, 'epoch': '1.05'}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 275/536 [27:11<40:23, 9.29s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 276/536 [27:15<32:37, 7.53s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 277/536 [27:17<25:45, 5.97s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 278/536 [27:19<20:59, 4.88s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 279/536 [27:22<17:35, 4.11s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 280/536 [27:24<15:12, 3.56s/it] {'loss': '0.447', 'grad_norm': '0.2158', 'learning_rate': '5.503e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 36700160, 'tokens/trainable': 34129632, 'epoch': '1.069'}
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 280/536 [27:24<15:12, 3.56s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 281/536 [27:26<13:35, 3.20s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 282/536 [27:28<12:20, 2.91s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 283/536 [27:31<11:30, 2.73s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 284/536 [27:33<10:54, 2.60s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 285/536 [27:36<10:51, 2.60s/it] {'loss': '0.4378', 'grad_norm': '0.2119', 'learning_rate': '5.341e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5854', 'tokens/total': 37355520, 'tokens/trainable': 34742888, 'epoch': '1.088'}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 285/536 [27:36<10:51, 2.60s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 286/536 [27:38<10:42, 2.57s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 287/536 [27:41<10:26, 2.51s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 288/536 [27:43<10:02, 2.43s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 289/536 [27:45<09:45, 2.37s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 290/536 [27:47<09:40, 2.36s/it] {'loss': '0.4756', 'grad_norm': '0.2246', 'learning_rate': '5.179e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6534', 'tokens/total': 38010880, 'tokens/trainable': 35352848, 'epoch': '1.107'}
54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 290/536 [27:47<09:40, 2.36s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 291/536 [27:50<09:33, 2.34s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 292/536 [27:52<09:27, 2.33s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 293/536 [27:54<09:23, 2.32s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 294/536 [27:57<09:23, 2.33s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 295/536 [27:59<09:21, 2.33s/it] {'loss': '0.4635', 'grad_norm': '0.2188', 'learning_rate': '5.016e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6541', 'tokens/total': 38666240, 'tokens/trainable': 35964736, 'epoch': '1.126'}
55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 295/536 [27:59<09:21, 2.33s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 296/536 [28:01<09:16, 2.32s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 297/536 [28:03<09:09, 2.30s/it][2026-03-16 19:39:38,467] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-297
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 298/536 [29:43<2:05:14, 31.57s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 299/536 [29:46<1:30:06, 22.81s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 300/536 [29:48<1:05:28, 16.65s/it] {'loss': '0.4578', 'grad_norm': '0.2334', 'learning_rate': '4.854e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6804', 'tokens/total': 39321600, 'tokens/trainable': 36579308, 'epoch': '1.145'}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 300/536 [29:48<1:05:28, 16.65s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 301/536 [29:50<48:19, 12.34s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 302/536 [29:53<36:18, 9.31s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 303/536 [29:55<27:59, 7.21s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 304/536 [29:57<22:22, 5.78s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 305/536 [30:00<18:50, 4.89s/it] {'loss': '0.4526', 'grad_norm': '0.2129', 'learning_rate': '4.691e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6214', 'tokens/total': 39976960, 'tokens/trainable': 37187912, 'epoch': '1.164'}
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 305/536 [30:00<18:50, 4.89s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 306/536 [30:02<15:44, 4.11s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 307/536 [30:05<13:31, 3.55s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 308/536 [30:07<12:06, 3.19s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 309/536 [30:09<11:00, 2.91s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 310/536 [30:12<10:18, 2.74s/it] {'loss': '0.4482', 'grad_norm': '0.21', 'learning_rate': '4.529e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6526', 'tokens/total': 40632320, 'tokens/trainable': 37799984, 'epoch': '1.183'}
58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 310/536 [30:12<10:18, 2.74s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 311/536 [30:14<09:49, 2.62s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 312/536 [30:16<09:25, 2.53s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 313/536 [30:18<09:07, 2.45s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 314/536 [30:21<09:33, 2.58s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 315/536 [30:24<09:07, 2.48s/it] {'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '4.368e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6899', 'tokens/total': 41287680, 'tokens/trainable': 38409832, 'epoch': '1.202'}
59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 315/536 [30:24<09:07, 2.48s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 316/536 [30:26<08:55, 2.43s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 317/536 [30:28<08:44, 2.39s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 318/536 [30:31<08:35, 2.36s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 319/536 [30:33<08:25, 2.33s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 320/536 [30:35<08:19, 2.31s/it] {'loss': '0.4539', 'grad_norm': '0.2285', 'learning_rate': '4.207e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6757', 'tokens/total': 41943040, 'tokens/trainable': 39020096, 'epoch': '1.221'}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 320/536 [30:35<08:19, 2.31s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 321/536 [30:37<08:17, 2.31s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 322/536 [30:40<08:09, 2.29s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 323/536 [30:42<08:19, 2.35s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 324/536 [30:45<08:26, 2.39s/it][2026-03-16 19:42:20,100] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-324
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.49s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.49s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 325/536 [32:22<1:48:54, 30.97s/it] {'loss': '0.4481', 'grad_norm': '0.2246', 'learning_rate': '4.046e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6887', 'tokens/total': 42598400, 'tokens/trainable': 39629160, 'epoch': '1.24'}
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 325/536 [32:22<1:48:54, 30.97s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 326/536 [32:24<1:18:13, 22.35s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 327/536 [32:27<56:54, 16.34s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 328/536 [32:29<42:00, 12.12s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 329/536 [32:31<31:43, 9.20s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 330/536 [32:34<24:27, 7.12s/it] {'loss': '0.4542', 'grad_norm': '0.2256', 'learning_rate': '3.887e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6731', 'tokens/total': 43253760, 'tokens/trainable': 40237288, 'epoch': '1.26'}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 330/536 [32:34<24:27, 7.12s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 331/536 [32:36<19:21, 5.67s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 332/536 [32:39<16:57, 4.99s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 333/536 [32:42<14:06, 4.17s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 334/536 [32:44<12:07, 3.60s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 335/536 [32:46<10:43, 3.20s/it] {'loss': '0.4412', 'grad_norm': '0.2539', 'learning_rate': '3.729e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6745', 'tokens/total': 43909120, 'tokens/trainable': 40848032, 'epoch': '1.279'}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 335/536 [32:46<10:43, 3.20s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 336/536 [32:48<09:46, 2.93s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 337/536 [32:51<09:05, 2.74s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 338/536 [32:53<08:37, 2.61s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 339/536 [32:55<08:14, 2.51s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 340/536 [32:58<08:04, 2.47s/it] {'loss': '0.4615', 'grad_norm': '0.2217', 'learning_rate': '3.573e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6370', 'tokens/total': 44564480, 'tokens/trainable': 41457624, 'epoch': '1.298'}
63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 340/536 [32:58<08:04, 2.47s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 341/536 [33:00<08:05, 2.49s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 342/536 [33:03<07:51, 2.43s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 343/536 [33:05<07:49, 2.43s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 344/536 [33:08<07:51, 2.46s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 345/536 [33:10<07:38, 2.40s/it] {'loss': '0.4599', 'grad_norm': '0.2188', 'learning_rate': '3.418e-06', 'ppl': '1.584', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6795', 'tokens/total': 45219840, 'tokens/trainable': 42069272, 'epoch': '1.317'}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 345/536 [33:10<07:38, 2.40s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 346/536 [33:12<07:29, 2.37s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 347/536 [33:14<07:26, 2.36s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 348/536 [33:17<07:20, 2.34s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 349/536 [33:19<07:12, 2.32s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 350/536 [33:21<07:09, 2.31s/it] {'loss': '0.4499', 'grad_norm': '0.2148', 'learning_rate': '3.264e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6691', 'tokens/total': 45875200, 'tokens/trainable': 42681132, 'epoch': '1.336'}
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 350/536 [33:21<07:09, 2.31s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 351/536 [33:24<07:04, 2.29s/it][2026-03-16 19:44:58,459] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-351
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.55s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.55s/it]
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 352/536 [35:02<1:35:17, 31.07s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 353/536 [35:04<1:08:31, 22.47s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 354/536 [35:06<49:47, 16.42s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 355/536 [35:09<36:49, 12.21s/it] {'loss': '0.4529', 'grad_norm': '0.249', 'learning_rate': '3.113e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6371', 'tokens/total': 46530560, 'tokens/trainable': 43292904, 'epoch': '1.355'}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 355/536 [35:09<36:49, 12.21s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 356/536 [35:11<27:37, 9.21s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 357/536 [35:13<21:14, 7.12s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 358/536 [35:16<16:48, 5.67s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 359/536 [35:18<13:56, 4.73s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 360/536 [35:21<11:51, 4.04s/it] {'loss': '0.4461', 'grad_norm': '0.2207', 'learning_rate': '2.963e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6150', 'tokens/total': 47185920, 'tokens/trainable': 43900272, 'epoch': '1.374'}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 360/536 [35:21<11:51, 4.04s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 361/536 [35:23<10:14, 3.51s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 362/536 [35:25<09:24, 3.25s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 363/536 [35:28<08:33, 2.97s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 364/536 [35:30<07:52, 2.75s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 365/536 [35:32<07:23, 2.59s/it] {'loss': '0.4581', 'grad_norm': '0.3555', 'learning_rate': '2.816e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6767', 'tokens/total': 47841280, 'tokens/trainable': 44509872, 'epoch': '1.393'}
68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 365/536 [35:32<07:23, 2.59s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 366/536 [35:34<07:04, 2.50s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 367/536 [35:37<06:49, 2.42s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 368/536 [35:39<06:39, 2.38s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 369/536 [35:41<06:33, 2.36s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 370/536 [35:44<06:33, 2.37s/it] {'loss': '0.4483', 'grad_norm': '0.2109', 'learning_rate': '2.671e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6359', 'tokens/total': 48496640, 'tokens/trainable': 45121444, 'epoch': '1.412'}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 370/536 [35:44<06:33, 2.37s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 371/536 [35:46<06:27, 2.35s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 372/536 [35:48<06:25, 2.35s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 373/536 [35:51<06:17, 2.31s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 374/536 [35:53<06:12, 2.30s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 375/536 [35:55<06:13, 2.32s/it] {'loss': '0.4475', 'grad_norm': '0.2617', 'learning_rate': '2.528e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6505', 'tokens/total': 49152000, 'tokens/trainable': 45733296, 'epoch': '1.431'}
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 375/536 [35:55<06:13, 2.32s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 376/536 [35:58<06:09, 2.31s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 377/536 [36:00<06:26, 2.43s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 378/536 [36:03<06:17, 2.39s/it][2026-03-16 19:47:37,290] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-378
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.77s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.77s/it]
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 379/536 [37:39<1:20:25, 30.73s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 380/536 [37:42<57:44, 22.21s/it] {'loss': '0.4467', 'grad_norm': '0.208', 'learning_rate': '2.388e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6639', 'tokens/total': 49807360, 'tokens/trainable': 46341868, 'epoch': '1.45'}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 380/536 [37:42<57:44, 22.21s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 381/536 [37:44<41:54, 16.22s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 382/536 [37:46<31:01, 12.09s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 383/536 [37:49<23:20, 9.16s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 384/536 [37:51<17:57, 7.09s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 385/536 [37:53<14:15, 5.66s/it] {'loss': '0.4373', 'grad_norm': '0.2129', 'learning_rate': '2.251e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6487', 'tokens/total': 50462720, 'tokens/trainable': 46948144, 'epoch': '1.469'}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 385/536 [37:53<14:15, 5.66s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 386/536 [37:56<11:39, 4.67s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 387/536 [37:58<09:46, 3.94s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 388/536 [38:00<08:31, 3.45s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 389/536 [38:03<07:40, 3.14s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 390/536 [38:05<07:00, 2.88s/it] {'loss': '0.452', 'grad_norm': '0.2314', 'learning_rate': '2.117e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 51118080, 'tokens/trainable': 47558056, 'epoch': '1.489'}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 390/536 [38:05<07:00, 2.88s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 391/536 [38:08<07:22, 3.05s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 392/536 [38:11<06:49, 2.85s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 393/536 [38:13<06:22, 2.68s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 394/536 [38:15<06:02, 2.55s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 395/536 [38:18<06:01, 2.56s/it] {'loss': '0.4435', 'grad_norm': '0.2139', 'learning_rate': '1.985e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5877', 'tokens/total': 51773440, 'tokens/trainable': 48168520, 'epoch': '1.508'}
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 395/536 [38:18<06:01, 2.56s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 396/536 [38:20<05:47, 2.48s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 397/536 [38:22<05:35, 2.42s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 398/536 [38:25<05:27, 2.37s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 399/536 [38:27<05:22, 2.35s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 400/536 [38:29<05:21, 2.37s/it] {'loss': '0.4444', 'grad_norm': '0.2236', 'learning_rate': '1.857e-06', 'ppl': '1.56', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6406', 'tokens/total': 52428800, 'tokens/trainable': 48779416, 'epoch': '1.527'}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 400/536 [38:29<05:21, 2.37s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 401/536 [38:32<05:15, 2.34s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 402/536 [38:34<05:10, 2.32s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 403/536 [38:36<05:10, 2.34s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 404/536 [38:39<05:07, 2.33s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 405/536 [38:41<05:03, 2.32s/it] {'loss': '0.4557', 'grad_norm': '0.2324', 'learning_rate': '1.732e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6719', 'tokens/total': 53084160, 'tokens/trainable': 49387032, 'epoch': '1.546'}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 405/536 [38:41<05:03, 2.32s/it][2026-03-16 19:50:15,755] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-405
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.46s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.46s/it]
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 406/536 [40:19<1:06:58, 30.91s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 407/536 [40:21<48:00, 22.33s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 408/536 [40:23<34:49, 16.33s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 409/536 [40:25<25:39, 12.12s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 410/536 [40:28<19:38, 9.36s/it] {'loss': '0.4617', 'grad_norm': '0.2168', 'learning_rate': '1.611e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5250', 'tokens/total': 53739520, 'tokens/trainable': 49995344, 'epoch': '1.565'}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 410/536 [40:28<19:38, 9.36s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 411/536 [40:31<15:03, 7.23s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 412/536 [40:33<11:54, 5.76s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 413/536 [40:36<09:53, 4.83s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 414/536 [40:38<08:16, 4.07s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 415/536 [40:40<07:07, 3.53s/it] {'loss': '0.4492', 'grad_norm': '0.2217', 'learning_rate': '1.493e-06', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6671', 'tokens/total': 54394880, 'tokens/trainable': 50603264, 'epoch': '1.584'}
77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 415/536 [40:40<07:07, 3.53s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 416/536 [40:43<06:19, 3.17s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 417/536 [40:45<05:45, 2.90s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 418/536 [40:47<05:26, 2.77s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 419/536 [40:50<05:18, 2.73s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 420/536 [40:52<05:02, 2.61s/it] {'loss': '0.4522', 'grad_norm': '0.2676', 'learning_rate': '1.379e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 55050240, 'tokens/trainable': 51213612, 'epoch': '1.603'}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 420/536 [40:52<05:02, 2.61s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 421/536 [40:55<04:49, 2.52s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 422/536 [40:57<04:40, 2.46s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 423/536 [40:59<04:40, 2.49s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 424/536 [41:02<04:29, 2.41s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 425/536 [41:04<04:21, 2.36s/it] {'loss': '0.4414', 'grad_norm': '0.2168', 'learning_rate': '1.269e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6755', 'tokens/total': 55705600, 'tokens/trainable': 51819592, 'epoch': '1.622'}
79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 425/536 [41:04<04:21, 2.36s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 426/536 [41:06<04:17, 2.34s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 427/536 [41:08<04:13, 2.32s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 428/536 [41:11<04:08, 2.30s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 429/536 [41:13<04:07, 2.31s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 430/536 [41:15<04:04, 2.31s/it] {'loss': '0.4532', 'grad_norm': '0.2217', 'learning_rate': '1.163e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6642', 'tokens/total': 56360960, 'tokens/trainable': 52431520, 'epoch': '1.641'}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 430/536 [41:15<04:04, 2.31s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 431/536 [41:18<04:10, 2.38s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 432/536 [41:20<04:04, 2.35s/it][2026-03-16 19:52:55,057] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-432
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.45s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.45s/it]
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 433/536 [42:58<53:14, 31.01s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 434/536 [43:00<38:05, 22.41s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 435/536 [43:03<27:33, 16.37s/it] {'loss': '0.4605', 'grad_norm': '0.3574', 'learning_rate': '1.061e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 57016320, 'tokens/trainable': 53041944, 'epoch': '1.66'}
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 435/536 [43:03<27:33, 16.37s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 436/536 [43:05<20:20, 12.20s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 437/536 [43:07<15:11, 9.20s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 438/536 [43:10<11:41, 7.16s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 439/536 [43:12<09:11, 5.68s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 440/536 [43:14<07:29, 4.68s/it] {'loss': '0.446', 'grad_norm': '0.2119', 'learning_rate': '9.626e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6513', 'tokens/total': 57671680, 'tokens/trainable': 53647180, 'epoch': '1.679'}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 440/536 [43:14<07:29, 4.68s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 441/536 [43:17<06:23, 4.04s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 442/536 [43:19<05:31, 3.53s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 443/536 [43:21<04:53, 3.16s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 444/536 [43:24<04:26, 2.90s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 445/536 [43:26<04:04, 2.69s/it] {'loss': '0.4299', 'grad_norm': '0.2188', 'learning_rate': '8.688e-07', 'ppl': '1.537', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6951', 'tokens/total': 58327040, 'tokens/trainable': 54256432, 'epoch': '1.698'}
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 445/536 [43:26<04:04, 2.69s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 446/536 [43:28<03:52, 2.58s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 447/536 [43:31<03:41, 2.49s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 448/536 [43:33<03:32, 2.41s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 449/536 [43:35<03:30, 2.42s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 450/536 [43:38<03:23, 2.37s/it] {'loss': '0.4583', 'grad_norm': '0.2188', 'learning_rate': '7.794e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6777', 'tokens/total': 58982400, 'tokens/trainable': 54863148, 'epoch': '1.718'}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 450/536 [43:38<03:23, 2.37s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 451/536 [43:40<03:28, 2.46s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 452/536 [43:43<03:24, 2.44s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 453/536 [43:45<03:19, 2.40s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 454/536 [43:47<03:15, 2.38s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 455/536 [43:50<03:12, 2.38s/it] {'loss': '0.4523', 'grad_norm': '0.2119', 'learning_rate': '6.945e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6454', 'tokens/total': 59637760, 'tokens/trainable': 55471580, 'epoch': '1.737'}
85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 455/536 [43:50<03:12, 2.38s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 456/536 [43:52<03:15, 2.44s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 457/536 [43:54<03:10, 2.41s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 458/536 [43:57<03:11, 2.46s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 459/536 [43:59<03:05, 2.41s/it][2026-03-16 19:55:34,637] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-459
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]
86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 460/536 [45:37<39:11, 30.94s/it] {'loss': '0.4523', 'grad_norm': '0.2148', 'learning_rate': '6.141e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6711', 'tokens/total': 60293120, 'tokens/trainable': 56081832, 'epoch': '1.756'}
86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 460/536 [45:37<39:11, 30.94s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 461/536 [45:39<27:55, 22.34s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 462/536 [45:41<20:06, 16.30s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 463/536 [45:44<14:42, 12.09s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 464/536 [45:46<10:58, 9.14s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 465/536 [45:48<08:21, 7.06s/it] {'loss': '0.4461', 'grad_norm': '0.4551', 'learning_rate': '5.383e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6903', 'tokens/total': 60948480, 'tokens/trainable': 56694240, 'epoch': '1.775'}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 465/536 [45:48<08:21, 7.06s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 466/536 [45:50<06:33, 5.62s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 467/536 [45:53<05:23, 4.69s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 468/536 [45:55<04:29, 3.97s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 469/536 [45:57<03:50, 3.44s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 470/536 [46:01<03:45, 3.42s/it] {'loss': '0.4341', 'grad_norm': '0.208', 'learning_rate': '4.673e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4512', 'tokens/total': 61603840, 'tokens/trainable': 57301404, 'epoch': '1.794'}
88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 470/536 [46:01<03:45, 3.42s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 471/536 [46:03<03:20, 3.09s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 472/536 [46:05<03:02, 2.86s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 473/536 [46:08<02:50, 2.70s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 474/536 [46:10<02:39, 2.58s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 475/536 [46:12<02:34, 2.54s/it] {'loss': '0.4627', 'grad_norm': '0.2461', 'learning_rate': '4.011e-07', 'ppl': '1.588', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6249', 'tokens/total': 62259200, 'tokens/trainable': 57910216, 'epoch': '1.813'}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 475/536 [46:12<02:34, 2.54s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 476/536 [46:15<02:26, 2.45s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 477/536 [46:17<02:24, 2.45s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 478/536 [46:20<02:25, 2.51s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 479/536 [46:22<02:19, 2.45s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 480/536 [46:24<02:13, 2.39s/it] {'loss': '0.4538', 'grad_norm': '0.2178', 'learning_rate': '3.397e-07', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6635', 'tokens/total': 62914560, 'tokens/trainable': 58517712, 'epoch': '1.832'}
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 480/536 [46:24<02:13, 2.39s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 481/536 [46:27<02:09, 2.36s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 482/536 [46:29<02:05, 2.33s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 483/536 [46:31<02:03, 2.33s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 484/536 [46:34<02:01, 2.33s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 485/536 [46:36<02:04, 2.44s/it] {'loss': '0.4395', 'grad_norm': '0.208', 'learning_rate': '2.833e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6504', 'tokens/total': 63569920, 'tokens/trainable': 59125608, 'epoch': '1.851'}
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 485/536 [46:36<02:04, 2.44s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 486/536 [46:39<01:59, 2.39s/it][2026-03-16 19:58:13,418] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-486
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 487/536 [48:16<25:09, 30.81s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 488/536 [48:18<17:48, 22.26s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 489/536 [48:20<12:44, 16.26s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 490/536 [48:24<09:32, 12.45s/it] {'loss': '0.4478', 'grad_norm': '0.2236', 'learning_rate': '2.318e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4290', 'tokens/total': 64225280, 'tokens/trainable': 59733412, 'epoch': '1.87'}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 490/536 [48:24<09:32, 12.45s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 491/536 [48:26<07:02, 9.39s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 492/536 [48:28<05:19, 7.26s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 493/536 [48:31<04:08, 5.78s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 494/536 [48:33<03:22, 4.82s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 495/536 [48:36<02:47, 4.08s/it] {'loss': '0.4362', 'grad_norm': '0.2129', 'learning_rate': '1.854e-07', 'ppl': '1.547', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6524', 'tokens/total': 64880640, 'tokens/trainable': 60339904, 'epoch': '1.889'}
92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 495/536 [48:36<02:47, 4.08s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 496/536 [48:38<02:25, 3.64s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 497/536 [48:40<02:06, 3.23s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 498/536 [48:43<01:52, 2.96s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 499/536 [48:45<01:42, 2.76s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 500/536 [48:47<01:34, 2.62s/it] {'loss': '0.4656', 'grad_norm': '0.2217', 'learning_rate': '1.441e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6649', 'tokens/total': 65536000, 'tokens/trainable': 60945412, 'epoch': '1.908'}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 500/536 [48:47<01:34, 2.62s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 501/536 [48:50<01:29, 2.55s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 502/536 [48:52<01:25, 2.52s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 503/536 [48:54<01:20, 2.44s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 504/536 [48:57<01:16, 2.39s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 505/536 [48:59<01:13, 2.36s/it] {'loss': '0.4466', 'grad_norm': '0.2129', 'learning_rate': '1.079e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6658', 'tokens/total': 66191360, 'tokens/trainable': 61550936, 'epoch': '1.927'}
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 505/536 [48:59<01:13, 2.36s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 506/536 [49:01<01:09, 2.33s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 507/536 [49:04<01:07, 2.31s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 508/536 [49:06<01:04, 2.31s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 509/536 [49:08<01:04, 2.37s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 510/536 [49:11<01:01, 2.35s/it] {'loss': '0.4754', 'grad_norm': '0.2168', 'learning_rate': '7.691e-08', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 66846720, 'tokens/trainable': 62157088, 'epoch': '1.947'}
95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 510/536 [49:11<01:01, 2.35s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 511/536 [49:13<00:58, 2.34s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 512/536 [49:15<00:55, 2.32s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 513/536 [49:18<00:53, 2.31s/it][2026-03-16 20:00:52,618] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-513
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.79s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.79s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 514/536 [50:57<11:33, 31.50s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 515/536 [51:00<07:58, 22.77s/it] {'loss': '0.4548', 'grad_norm': '0.2217', 'learning_rate': '5.11e-08', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6342', 'tokens/total': 67502080, 'tokens/trainable': 62762592, 'epoch': '1.966'}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 515/536 [51:00<07:58, 22.77s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 516/536 [51:02<05:32, 16.65s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 517/536 [51:04<03:54, 12.32s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 518/536 [51:06<02:47, 9.30s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 519/536 [51:09<02:02, 7.18s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 520/536 [51:11<01:32, 5.76s/it] {'loss': '0.4544', 'grad_norm': '0.2314', 'learning_rate': '3.054e-08', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6125', 'tokens/total': 68157440, 'tokens/trainable': 63366800, 'epoch': '1.985'}
97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 520/536 [51:11<01:32, 5.76s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 521/536 [51:13<01:10, 4.72s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 522/536 [51:16<00:55, 3.98s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 523/536 [51:18<00:45, 3.47s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 524/536 [51:20<00:38, 3.19s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 525/536 [51:24<00:35, 3.21s/it] {'loss': '0.4504', 'grad_norm': '0.2246', 'learning_rate': '1.522e-08', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6753', 'tokens/total': 68812800, 'tokens/trainable': 63975056, 'epoch': '2.004'}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 525/536 [51:24<00:35, 3.21s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 526/536 [51:26<00:29, 2.92s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 527/536 [51:28<00:24, 2.75s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 528/536 [51:31<00:21, 2.71s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 529/536 [51:33<00:18, 2.60s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 530/536 [51:36<00:15, 2.52s/it] {'loss': '0.4546', 'grad_norm': '0.2109', 'learning_rate': '5.182e-09', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6575', 'tokens/total': 69468160, 'tokens/trainable': 64586448, 'epoch': '2.023'}
99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 530/536 [51:36<00:15, 2.52s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 531/536 [51:38<00:12, 2.45s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 532/536 [51:40<00:09, 2.40s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 533/536 [51:43<00:07, 2.40s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 534/536 [51:45<00:04, 2.39s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 535/536 [51:47<00:02, 2.42s/it] {'loss': '0.4493', 'grad_norm': '0.2314', 'learning_rate': '4.231e-10', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6115', 'tokens/total': 70123520, 'tokens/trainable': 65199364, 'epoch': '2.042'}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 535/536 [51:47<00:02, 2.42s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [51:50<00:00, 2.37s/it][2026-03-16 20:03:25,941] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-536
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.68s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.68s/it]
{'train_runtime': '3210', 'train_samples_per_second': '2.672', 'train_steps_per_second': '0.167', 'train_loss': '0.4897', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'epoch': '2.046', 'tokens/train_per_sec_per_gpu': '6757'}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [53:26<00:00, 2.37s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [53:26<00:00, 5.98s/it]
[2026-03-16 20:04:52,263] [INFO] [axolotl.train.save_trained_model:237] [PID:213] Training completed! Saving trained model to ./outputs/qwen3-sft-stmt-tk/.
[2026-03-16 20:05:01,009] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.53s/it] Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.53s/it]
[2026-03-16 20:05:19,091] [INFO] [axolotl.train.save_trained_model:351] [PID:213] Model successfully saved to ./outputs/qwen3-sft-stmt-tk/