[2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB ()
[2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "qwen3-sft-stmt-tk.yml",
  "base_model": "Qwen/Qwen3-8B",
  "base_model_config": "Qwen/Qwen3-8B",
  "batch_size": 16,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 8,
    "n_node": 1
  },
  "chat_template": "qwen3",
  "chat_template_kwargs": {
    "enable_thinking": false
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 8,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 192,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "xiaolesu/lean4-sft-stmt-tk",
      "split": "train",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "evals_per_epoch": 10,
  "experimental_skip_move_to_device": true,
  "flex_attention": true,
  "flex_attn_compile_kwargs": {
    "dynamic": false,
    "mode": "max-autotune-no-cudagraphs"
  },
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "activation_checkpointing": true,
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "fsdp_version": 2,
    "offload_params": false,
    "reshard_after_forward": true,
    "state_dict_type": "FULL_STATE_DICT",
    "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
  },
  "fsdp_version": 2,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "include_tkps": true,
  "learning_rate": 1e-05,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 5,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "qwen3",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/qwen3-sft-stmt-tk/",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.05,
  "save_total_limit": 3,
  "saves_per_epoch": 10,
  "sequence_len": 8192,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3-8B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "qwen3-8b-tk-run1",
  "wandb_project": "qwen3-sft-stmt-tk",
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 8
}
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets...
[2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Fetching 0 files: 0it [00:00, ?it/s]Fetching 0 files: 0it [00:00, ?it/s]
[2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159
Dropping Invalid Sequences (<None or >8192) (num_proc=192):   0%|          | 0/11192 [00:00<?, ? examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):   1%|          | 59/11192 [00:02<06:34, 28.25 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):   3%|▎         | 295/11192 [00:02<01:02, 175.65 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):   6%|▌         | 649/11192 [00:02<00:23, 453.06 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):   8%|▊         | 885/11192 [00:02<00:16, 634.46 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  10%|█         | 1121/11192 [00:02<00:11, 849.04 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  13%|█▎        | 1416/11192 [00:02<00:08, 1166.00 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  15%|█▌        | 1711/11192 [00:02<00:06, 1480.17 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  18%|█▊        | 2006/11192 [00:02<00:05, 1697.58 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  21%|██        | 2301/11192 [00:02<00:04, 1949.74 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  23%|██▎       | 2596/11192 [00:03<00:04, 2145.10 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  26%|██▌       | 2891/11192 [00:03<00:03, 2324.57 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  29%|██▉       | 3245/11192 [00:03<00:03, 2566.75 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192):  70%|██████▉   | 7828/11192 [00:03<00:00, 14035.00 examples/s]Dropping Invalid Sequences (<None or >8192) (num_proc=192): 100%|██████████| 11192/11192 [00:04<00:00, 2753.84 examples/s]
[2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192])
Drop Samples with Zero Trainable Tokens (num_proc=192):   0%|          | 0/10830 [00:00<?, ? examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):   1%|          | 57/10830 [00:02<06:27, 27.78 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):   3%|▎         | 285/10830 [00:02<01:00, 173.64 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):   4%|▍         | 456/10830 [00:02<00:34, 299.77 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):   6%|▋         | 684/10830 [00:02<00:20, 506.62 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):   8%|▊         | 912/10830 [00:02<00:13, 736.95 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  11%|█         | 1140/10830 [00:02<00:10, 947.17 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  13%|█▎        | 1368/10830 [00:02<00:08, 1094.03 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  15%|█▍        | 1596/10830 [00:02<00:07, 1269.49 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  17%|█▋        | 1824/10830 [00:02<00:06, 1437.65 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  19%|█▉        | 2052/10830 [00:03<00:05, 1614.63 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  21%|██        | 2280/10830 [00:03<00:05, 1635.72 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  23%|██▎       | 2508/10830 [00:03<00:04, 1732.21 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  25%|██▌       | 2736/10830 [00:03<00:04, 1721.60 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  27%|██▋       | 2964/10830 [00:03<00:04, 1703.27 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  29%|██▉       | 3192/10830 [00:03<00:04, 1798.77 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  32%|███▏      | 3477/10830 [00:03<00:03, 1958.86 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  34%|███▍      | 3705/10830 [00:03<00:03, 2037.08 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  36%|███▋      | 3933/10830 [00:04<00:03, 2067.96 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  38%|███▊      | 4161/10830 [00:04<00:03, 2091.19 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  41%|████      | 4389/10830 [00:04<00:05, 1127.36 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  43%|████▎     | 4670/10830 [00:04<00:04, 1385.39 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  45%|████▌     | 4894/10830 [00:04<00:04, 1432.10 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192):  69%|██████▉   | 7526/10830 [00:04<00:00, 6499.14 examples/s]Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|██████████| 10830/10830 [00:05<00:00, 1931.57 examples/s]
Add position_id column (Sample Packing) (num_proc=192):   0%|          | 0/10830 [00:00<?, ? examples/s]Add position_id column (Sample Packing) (num_proc=192):   1%|          | 57/10830 [00:02<06:33, 27.40 examples/s]Add position_id column (Sample Packing) (num_proc=192):   2%|▏         | 228/10830 [00:02<01:18, 135.14 examples/s]Add position_id column (Sample Packing) (num_proc=192):   4%|▍         | 456/10830 [00:02<00:33, 310.31 examples/s]Add position_id column (Sample Packing) (num_proc=192):   8%|▊         | 912/10830 [00:02<00:14, 692.10 examples/s]Add position_id column (Sample Packing) (num_proc=192):  11%|█         | 1140/10830 [00:02<00:11, 858.26 examples/s]Add position_id column (Sample Packing) (num_proc=192):  13%|█▎        | 1368/10830 [00:02<00:09, 1027.56 examples/s]Add position_id column (Sample Packing) (num_proc=192):  15%|█▍        | 1596/10830 [00:02<00:07, 1182.55 examples/s]Add position_id column (Sample Packing) (num_proc=192):  17%|█▋        | 1881/10830 [00:02<00:06, 1425.26 examples/s]Add position_id column (Sample Packing) (num_proc=192):  20%|██        | 2166/10830 [00:03<00:05, 1604.97 examples/s]Add position_id column (Sample Packing) (num_proc=192):  22%|██▏       | 2394/10830 [00:03<00:04, 1738.29 examples/s]Add position_id column (Sample Packing) (num_proc=192):  25%|██▍       | 2679/10830 [00:03<00:04, 1951.23 examples/s]Add position_id column (Sample Packing) (num_proc=192):  63%|██████▎   | 6854/10830 [00:03<00:00, 11681.66 examples/s]Add position_id column (Sample Packing) (num_proc=192): 100%|██████████| 10830/10830 [00:04<00:00, 2621.72 examples/s]
Saving the dataset (0/42 shards):   0%|          | 0/10830 [00:00<?, ? examples/s]Saving the dataset (0/42 shards):   2%|▏         | 258/10830 [00:00<00:22, 464.02 examples/s]Saving the dataset (1/42 shards):   2%|▏         | 258/10830 [00:00<00:22, 464.02 examples/s]Saving the dataset (2/42 shards):   7%|▋         | 774/10830 [00:00<00:21, 464.02 examples/s]Saving the dataset (3/42 shards):   7%|▋         | 774/10830 [00:00<00:21, 464.02 examples/s]Saving the dataset (4/42 shards):  14%|█▍        | 1548/10830 [00:00<00:20, 464.02 examples/s]Saving the dataset (5/42 shards):  14%|█▍        | 1548/10830 [00:00<00:20, 464.02 examples/s]Saving the dataset (6/42 shards):  17%|█▋        | 1806/10830 [00:00<00:19, 464.02 examples/s]Saving the dataset (7/42 shards):  19%|█▉        | 2064/10830 [00:00<00:18, 464.02 examples/s]Saving the dataset (8/42 shards):  21%|██▏       | 2322/10830 [00:00<00:18, 464.02 examples/s]Saving the dataset (9/42 shards):  21%|██▏       | 2322/10830 [00:00<00:18, 464.02 examples/s]Saving the dataset (10/42 shards):  26%|██▌       | 2838/10830 [00:00<00:17, 464.02 examples/s]Saving the dataset (11/42 shards):  29%|██▊       | 3096/10830 [00:00<00:16, 464.02 examples/s]Saving the dataset (12/42 shards):  31%|███       | 3354/10830 [00:00<00:16, 464.02 examples/s]Saving the dataset (13/42 shards):  33%|███▎      | 3612/10830 [00:00<00:15, 464.02 examples/s]Saving the dataset (14/42 shards):  33%|███▎      | 3612/10830 [00:00<00:15, 464.02 examples/s]Saving the dataset (15/42 shards):  38%|███▊      | 4128/10830 [00:00<00:14, 464.02 examples/s]Saving the dataset (16/42 shards):  40%|████      | 4386/10830 [00:00<00:13, 464.02 examples/s]Saving the dataset (17/42 shards):  40%|████      | 4386/10830 [00:00<00:13, 464.02 examples/s]Saving the dataset (18/42 shards):  45%|████▌     | 4902/10830 [00:00<00:12, 464.02 examples/s]Saving the dataset (19/42 shards):  48%|████▊     | 5160/10830 [00:00<00:12, 464.02 examples/s]Saving the dataset (20/42 shards):  48%|████▊     | 5160/10830 [00:00<00:12, 464.02 examples/s]Saving the dataset (21/42 shards):  52%|█████▏    | 5676/10830 [00:00<00:11, 464.02 examples/s]Saving the dataset (22/42 shards):  52%|█████▏    | 5676/10830 [00:00<00:11, 464.02 examples/s]Saving the dataset (23/42 shards):  55%|█████▍    | 5934/10830 [00:00<00:10, 464.02 examples/s]Saving the dataset (24/42 shards):  57%|█████▋    | 6192/10830 [00:00<00:09, 464.02 examples/s]Saving the dataset (25/42 shards):  64%|██████▍   | 6966/10830 [00:00<00:08, 464.02 examples/s]Saving the dataset (26/42 shards):  64%|██████▍   | 6966/10830 [00:00<00:08, 464.02 examples/s]Saving the dataset (27/42 shards):  64%|██████▍   | 6966/10830 [00:00<00:08, 464.02 examples/s]Saving the dataset (28/42 shards):  67%|██████▋   | 7224/10830 [00:00<00:07, 464.02 examples/s]Saving the dataset (29/42 shards):  74%|███████▍  | 7998/10830 [00:00<00:06, 464.02 examples/s]Saving the dataset (30/42 shards):  74%|███████▍  | 7998/10830 [00:00<00:06, 464.02 examples/s]Saving the dataset (31/42 shards):  74%|███████▍  | 7998/10830 [00:00<00:06, 464.02 examples/s]Saving the dataset (32/42 shards):  79%|███████▊  | 8514/10830 [00:00<00:04, 464.02 examples/s]Saving the dataset (33/42 shards):  81%|████████  | 8772/10830 [00:00<00:04, 464.02 examples/s]Saving the dataset (34/42 shards):  81%|████████  | 8772/10830 [00:00<00:04, 464.02 examples/s]Saving the dataset (35/42 shards):  83%|████████▎ | 9030/10830 [00:00<00:03, 464.02 examples/s]Saving the dataset (36/42 shards):  88%|████████▊ | 9545/10830 [00:00<00:02, 464.02 examples/s]Saving the dataset (37/42 shards):  88%|████████▊ | 9545/10830 [00:00<00:02, 464.02 examples/s]Saving the dataset (38/42 shards):  91%|█████████ | 9802/10830 [00:00<00:02, 464.02 examples/s]Saving the dataset (39/42 shards):  95%|█████████▌| 10316/10830 [00:00<00:01, 464.02 examples/s]Saving the dataset (40/42 shards):  95%|█████████▌| 10316/10830 [00:00<00:01, 464.02 examples/s]Saving the dataset (41/42 shards):  98%|█████████▊| 10573/10830 [00:00<00:00, 464.02 examples/s]Saving the dataset (42/42 shards): 100%|██████████| 10830/10830 [00:00<00:00, 464.02 examples/s]Saving the dataset (42/42 shards): 100%|██████████| 10830/10830 [00:00<00:00, 16314.56 examples/s]
[2026-03-16 19:08:54,045] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:213] total_num_tokens: 33_957_071
[2026-03-16 19:08:54,340] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:213] `total_supervised_tokens: 32_028_150`
[2026-03-16 19:08:55,893] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.7050187587738037
[2026-03-16 19:11:05,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2148, 2146, 2148, 2145, 2146, 2146, 2148, 2145]
[2026-03-16 19:11:06,172] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:213] data_loader_len: 268
[2026-03-16 19:11:06,189] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:213] sample_packing_eff_est across ranks: [0.9646614789962769, 0.9657852649688721, 0.9646614789962769, 0.9657852649688721, 0.9648860096931458, 0.9648860096931458, 0.9653354287147522, 0.9657852649688721]
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:213] sample_packing_eff_est: 0.97
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:213] total_num_steps: 536
[2026-03-16 19:11:06,192] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:213] Maximum number of steps set at 536
[2026-03-16 19:11:06,242] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:213] loading tokenizer... Qwen/Qwen3-8B
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model
[2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing...
[2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 9671.84it/s]
[2026-03-16 19:11:09,882] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:359] [PID:213] Converting modules to torch.bfloat16
[2026-03-16 19:11:09,885] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:213] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-16 19:11:11,696] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:213] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.Multiple deprecation warnings due to FSDP2 conversion:
sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-03-16 19:11:12,192] [INFO] [axolotl.train.save_initial_configs:417] [PID:213] Pre-saving tokenizer to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,283] [INFO] [axolotl.train.save_initial_configs:422] [PID:213] Pre-saving model config to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,286] [INFO] [axolotl.train.execute_training:218] [PID:213] Starting trainer...
[2026-03-16 19:11:14,793] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.9547648429870605
[2026-03-16 19:11:14,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2103, 2104, 2104, 2104, 2103, 2104, 2106, 2104]
[2026-03-16 19:11:15,013] [INFO] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:34] [PID:213] Broadcasting full state dict to all ranks...
[2026-03-16 19:11:22,269] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:86] [PID:213] Time taken to load full state dict: 7.26 seconds
[2026-03-16 19:11:22,270] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.log_gpu_memory_usage:127] [PID:213] Memory usage after broadcasting full state dict 3.067GB (+3.067GB allocated, +3.178GB reserved)
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
wandb: Currently logged in as: suxiaole0223 (suxiaole) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run kje10pck
wandb: Tracking run with wandb version 0.25.1
wandb: Run data is saved locally in /workspace/axolotl-workspace/wandb/run-20260316_191122-kje10pck
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run qwen3-8b-tk-run1
wandb: ⭐️ View project at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk
wandb: 🚀 View run at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk/runs/kje10pck
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-16 19:11:25,554] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:213] The Axolotl config has been saved to the WandB run under files.
  0%|          | 0/536 [00:00<?, ?it/s][2026-03-16 19:11:57,210] [WARNING] [py.warnings._showwarnmsg:110] [PID:213] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
  _warn_once(

  0%|          | 1/536 [00:40<6:03:21, 40.75s/it]  0%|          | 2/536 [00:43<2:42:00, 18.20s/it]  1%|          | 3/536 [00:45<1:37:15, 10.95s/it]  1%|          | 4/536 [00:47<1:07:23,  7.60s/it]  1%|          | 5/536 [00:50<50:28,  5.70s/it]                                                 {'loss': '0.8667', 'grad_norm': '2.609', 'learning_rate': '7.547e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6531', 'tokens/total': 655360, 'tokens/trainable': 611049, 'epoch': '0.01908'}
  1%|          | 5/536 [00:50<50:28,  5.70s/it]  1%|          | 6/536 [00:52<40:15,  4.56s/it]  1%|▏         | 7/536 [00:55<34:02,  3.86s/it]  1%|▏         | 8/536 [00:57<30:00,  3.41s/it]  2%|▏         | 9/536 [00:59<26:45,  3.05s/it]  2%|▏         | 10/536 [01:02<24:45,  2.82s/it]                                                {'loss': '0.8307', 'grad_norm': '2.5', 'learning_rate': '1.698e-06', 'ppl': '2.295', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6647', 'tokens/total': 1310720, 'tokens/trainable': 1224548, 'epoch': '0.03817'}
  2%|▏         | 10/536 [01:02<24:45,  2.82s/it]  2%|▏         | 11/536 [01:04<23:13,  2.65s/it]  2%|▏         | 12/536 [01:06<22:04,  2.53s/it]  2%|▏         | 13/536 [01:08<21:32,  2.47s/it]  3%|▎         | 14/536 [01:11<21:27,  2.47s/it]  3%|▎         | 15/536 [01:13<21:28,  2.47s/it]                                                {'loss': '0.8487', 'grad_norm': '2.453', 'learning_rate': '2.642e-06', 'ppl': '2.337', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6160', 'tokens/total': 1966080, 'tokens/trainable': 1834432, 'epoch': '0.05725'}
  3%|▎         | 15/536 [01:13<21:28,  2.47s/it]  3%|▎         | 16/536 [01:16<21:18,  2.46s/it]  3%|▎         | 17/536 [01:18<20:51,  2.41s/it]  3%|▎         | 18/536 [01:20<20:44,  2.40s/it]  4%|▎         | 19/536 [01:23<21:59,  2.55s/it]  4%|▎         | 20/536 [01:26<21:40,  2.52s/it]                                                {'loss': '0.7713', 'grad_norm': '1.898', 'learning_rate': '3.585e-06', 'ppl': '2.163', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6256', 'tokens/total': 2621440, 'tokens/trainable': 2448388, 'epoch': '0.07634'}
  4%|▎         | 20/536 [01:26<21:40,  2.52s/it]  4%|▍         | 21/536 [01:28<21:23,  2.49s/it]  4%|▍         | 22/536 [01:31<20:49,  2.43s/it]  4%|▍         | 23/536 [01:33<20:37,  2.41s/it]  4%|▍         | 24/536 [01:35<20:37,  2.42s/it]  5%|▍         | 25/536 [01:38<20:01,  2.35s/it]                                                {'loss': '0.7452', 'grad_norm': '1.273', 'learning_rate': '4.528e-06', 'ppl': '2.107', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6954', 'tokens/total': 3276800, 'tokens/trainable': 3060985, 'epoch': '0.09542'}
  5%|▍         | 25/536 [01:38<20:01,  2.35s/it]  5%|▍         | 26/536 [01:40<19:41,  2.32s/it]  5%|▌         | 27/536 [01:42<19:26,  2.29s/it][2026-03-16 19:13:17,483] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-27

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.48s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.48s/it]
  5%|▌         | 28/536 [03:16<4:11:47, 29.74s/it]  5%|▌         | 29/536 [03:18<3:01:39, 21.50s/it]  6%|▌         | 30/536 [03:20<2:12:38, 15.73s/it]                                                  {'loss': '0.718', 'grad_norm': '0.7695', 'learning_rate': '5.472e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6706', 'tokens/total': 3932160, 'tokens/trainable': 3670695, 'epoch': '0.1145'}
  6%|▌         | 30/536 [03:20<2:12:38, 15.73s/it]  6%|▌         | 31/536 [03:23<1:38:27, 11.70s/it]  6%|▌         | 32/536 [03:25<1:14:38,  8.89s/it]  6%|▌         | 33/536 [03:27<57:48,  6.90s/it]    6%|▋         | 34/536 [03:29<46:07,  5.51s/it]  7%|▋         | 35/536 [03:32<37:56,  4.54s/it]                                                {'loss': '0.6699', 'grad_norm': '0.6406', 'learning_rate': '6.415e-06', 'ppl': '1.954', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6770', 'tokens/total': 4587520, 'tokens/trainable': 4284736, 'epoch': '0.1336'}
  7%|▋         | 35/536 [03:32<37:56,  4.54s/it]  7%|▋         | 36/536 [03:34<32:19,  3.88s/it]  7%|▋         | 37/536 [03:37<28:45,  3.46s/it]  7%|▋         | 38/536 [03:39<26:05,  3.14s/it]  7%|▋         | 39/536 [03:41<24:10,  2.92s/it]  7%|▋         | 40/536 [03:44<22:31,  2.72s/it]                                                {'loss': '0.6393', 'grad_norm': '0.418', 'learning_rate': '7.358e-06', 'ppl': '1.895', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6668', 'tokens/total': 5242880, 'tokens/trainable': 4896504, 'epoch': '0.1527'}
  7%|▋         | 40/536 [03:44<22:31,  2.72s/it]  8%|▊         | 41/536 [03:46<21:24,  2.59s/it]  8%|▊         | 42/536 [03:48<20:36,  2.50s/it]  8%|▊         | 43/536 [03:51<20:06,  2.45s/it]  8%|▊         | 44/536 [03:53<19:38,  2.39s/it]  8%|▊         | 45/536 [03:55<19:17,  2.36s/it]                                                {'loss': '0.5953', 'grad_norm': '0.3594', 'learning_rate': '8.302e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 5898240, 'tokens/trainable': 5505933, 'epoch': '0.1718'}
  8%|▊         | 45/536 [03:55<19:17,  2.36s/it]  9%|▊         | 46/536 [03:57<19:17,  2.36s/it]  9%|▉         | 47/536 [04:00<19:01,  2.33s/it]  9%|▉         | 48/536 [04:02<19:02,  2.34s/it]  9%|▉         | 49/536 [04:04<19:02,  2.35s/it]  9%|▉         | 50/536 [04:07<18:55,  2.34s/it]                                                {'loss': '0.5779', 'grad_norm': '0.332', 'learning_rate': '9.245e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6574', 'tokens/total': 6553600, 'tokens/trainable': 6116643, 'epoch': '0.1908'}
  9%|▉         | 50/536 [04:07<18:55,  2.34s/it] 10%|▉         | 51/536 [04:09<18:46,  2.32s/it] 10%|▉         | 52/536 [04:11<18:33,  2.30s/it] 10%|▉         | 53/536 [04:14<18:19,  2.28s/it] 10%|█         | 54/536 [04:16<18:23,  2.29s/it][2026-03-16 19:15:50,860] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-54

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.65s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.65s/it]
 10%|█         | 55/536 [05:48<3:55:25, 29.37s/it]                                                  {'loss': '0.5579', 'grad_norm': '0.2793', 'learning_rate': '1e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4847', 'tokens/total': 7208960, 'tokens/trainable': 6728061, 'epoch': '0.2099'}
 10%|█         | 55/536 [05:48<3:55:25, 29.37s/it] 10%|█         | 56/536 [05:51<2:50:56, 21.37s/it] 11%|█         | 57/536 [05:54<2:05:27, 15.72s/it] 11%|█         | 58/536 [05:56<1:33:09, 11.69s/it] 11%|█         | 59/536 [05:58<1:11:13,  8.96s/it] 11%|█         | 60/536 [06:01<55:01,  6.94s/it]                                                  {'loss': '0.5485', 'grad_norm': '0.2773', 'learning_rate': '9.996e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 7864320, 'tokens/trainable': 7336524, 'epoch': '0.229'}
 11%|█         | 60/536 [06:01<55:01,  6.94s/it] 11%|█▏        | 61/536 [06:03<43:45,  5.53s/it] 12%|█▏        | 62/536 [06:05<36:05,  4.57s/it] 12%|█▏        | 63/536 [06:08<30:31,  3.87s/it] 12%|█▏        | 64/536 [06:10<26:38,  3.39s/it] 12%|█▏        | 65/536 [06:12<24:01,  3.06s/it]                                                {'loss': '0.5385', 'grad_norm': '0.2734', 'learning_rate': '9.987e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6565', 'tokens/total': 8519680, 'tokens/trainable': 7944984, 'epoch': '0.2481'}
 12%|█▏        | 65/536 [06:12<24:01,  3.06s/it] 12%|█▏        | 66/536 [06:14<22:09,  2.83s/it] 12%|█▎        | 67/536 [06:17<21:06,  2.70s/it] 13%|█▎        | 68/536 [06:19<20:07,  2.58s/it] 13%|█▎        | 69/536 [06:21<19:21,  2.49s/it] 13%|█▎        | 70/536 [06:24<19:00,  2.45s/it]                                                {'loss': '0.5197', 'grad_norm': '0.2578', 'learning_rate': '9.973e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 9175040, 'tokens/trainable': 8556200, 'epoch': '0.2672'}
 13%|█▎        | 70/536 [06:24<19:00,  2.45s/it] 13%|█▎        | 71/536 [06:26<18:33,  2.39s/it] 13%|█▎        | 72/536 [06:28<18:14,  2.36s/it] 14%|█▎        | 73/536 [06:31<18:01,  2.34s/it] 14%|█▍        | 74/536 [06:33<18:02,  2.34s/it] 14%|█▍        | 75/536 [06:35<17:54,  2.33s/it]                                                {'loss': '0.5316', 'grad_norm': '0.3008', 'learning_rate': '9.953e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6728', 'tokens/total': 9830400, 'tokens/trainable': 9167282, 'epoch': '0.2863'}
 14%|█▍        | 75/536 [06:35<17:54,  2.33s/it] 14%|█▍        | 76/536 [06:38<17:54,  2.34s/it] 14%|█▍        | 77/536 [06:40<18:05,  2.37s/it] 15%|█▍        | 78/536 [06:43<18:28,  2.42s/it] 15%|█▍        | 79/536 [06:45<18:05,  2.37s/it] 15%|█▍        | 80/536 [06:47<17:47,  2.34s/it]                                                {'loss': '0.5154', 'grad_norm': '0.3164', 'learning_rate': '9.929e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6730', 'tokens/total': 10485760, 'tokens/trainable': 9774908, 'epoch': '0.3053'}
 15%|█▍        | 80/536 [06:47<17:47,  2.34s/it] 15%|█▌        | 81/536 [06:49<17:39,  2.33s/it][2026-03-16 19:18:24,375] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-81

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.37s/it][AWriting model shards: 100%|██████████| 1/1 [00:17<00:00, 17.37s/it]
 15%|█▌        | 82/536 [08:22<3:43:29, 29.54s/it] 15%|█▌        | 83/536 [08:25<2:41:12, 21.35s/it] 16%|█▌        | 84/536 [08:27<1:57:43, 15.63s/it] 16%|█▌        | 85/536 [08:29<1:27:29, 11.64s/it]                                                  {'loss': '0.5143', 'grad_norm': '0.2363', 'learning_rate': '9.899e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6604', 'tokens/total': 11141120, 'tokens/trainable': 10388109, 'epoch': '0.3244'}
 16%|█▌        | 85/536 [08:29<1:27:29, 11.64s/it] 16%|█▌        | 86/536 [08:32<1:06:16,  8.84s/it] 16%|█▌        | 87/536 [08:34<51:19,  6.86s/it]   16%|█▋        | 88/536 [08:36<40:55,  5.48s/it] 17%|█▋        | 89/536 [08:38<33:37,  4.51s/it] 17%|█▋        | 90/536 [08:41<28:43,  3.86s/it]                                                {'loss': '0.4957', 'grad_norm': '0.2412', 'learning_rate': '9.864e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 11796480, 'tokens/trainable': 10999678, 'epoch': '0.3435'}
 17%|█▋        | 90/536 [08:41<28:43,  3.86s/it] 17%|█▋        | 91/536 [08:43<25:17,  3.41s/it] 17%|█▋        | 92/536 [08:45<22:49,  3.08s/it] 17%|█▋        | 93/536 [08:48<21:12,  2.87s/it] 18%|█▊        | 94/536 [08:50<19:44,  2.68s/it] 18%|█▊        | 95/536 [08:52<19:22,  2.64s/it]                                                {'loss': '0.509', 'grad_norm': '0.2236', 'learning_rate': '9.823e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5974', 'tokens/total': 12451840, 'tokens/trainable': 11609345, 'epoch': '0.3626'}
 18%|█▊        | 95/536 [08:52<19:22,  2.64s/it] 18%|█▊        | 96/536 [08:55<18:35,  2.54s/it] 18%|█▊        | 97/536 [08:57<18:01,  2.46s/it] 18%|█▊        | 98/536 [09:00<19:09,  2.62s/it] 18%|█▊        | 99/536 [09:03<19:00,  2.61s/it] 19%|█▊        | 100/536 [09:05<18:16,  2.51s/it]                                                 {'loss': '0.4925', 'grad_norm': '0.2451', 'learning_rate': '9.778e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6696', 'tokens/total': 13107200, 'tokens/trainable': 12218448, 'epoch': '0.3817'}
 19%|█▊        | 100/536 [09:05<18:16,  2.51s/it] 19%|█▉        | 101/536 [09:07<17:50,  2.46s/it] 19%|█▉        | 102/536 [09:09<17:19,  2.40s/it] 19%|█▉        | 103/536 [09:12<16:59,  2.35s/it] 19%|█▉        | 104/536 [09:14<16:47,  2.33s/it] 20%|█▉        | 105/536 [09:16<16:34,  2.31s/it]                                                 {'loss': '0.5051', 'grad_norm': '0.25', 'learning_rate': '9.727e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6724', 'tokens/total': 13762560, 'tokens/trainable': 12826468, 'epoch': '0.4008'}
 20%|█▉        | 105/536 [09:16<16:34,  2.31s/it] 20%|█▉        | 106/536 [09:19<16:28,  2.30s/it] 20%|█▉        | 107/536 [09:21<16:26,  2.30s/it] 20%|██        | 108/536 [09:23<16:27,  2.31s/it][2026-03-16 19:20:58,221] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-108

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.13s/it][AWriting model shards: 100%|██████████| 1/1 [00:17<00:00, 17.13s/it]
 20%|██        | 109/536 [11:03<3:44:22, 31.53s/it] 21%|██        | 110/536 [11:05<2:41:43, 22.78s/it]                                                   {'loss': '0.4725', 'grad_norm': '0.2266', 'learning_rate': '9.672e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6506', 'tokens/total': 14417920, 'tokens/trainable': 13440042, 'epoch': '0.4198'}
 21%|██        | 110/536 [11:05<2:41:43, 22.78s/it] 21%|██        | 111/536 [11:07<1:57:41, 16.61s/it] 21%|██        | 112/536 [11:10<1:26:54, 12.30s/it] 21%|██        | 113/536 [11:12<1:05:42,  9.32s/it] 21%|██▏       | 114/536 [11:14<50:52,  7.23s/it]   21%|██▏       | 115/536 [11:17<40:26,  5.76s/it]                                                 {'loss': '0.5004', 'grad_norm': '0.2256', 'learning_rate': '9.612e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6525', 'tokens/total': 15073280, 'tokens/trainable': 14049913, 'epoch': '0.4389'}
 21%|██▏       | 115/536 [11:17<40:26,  5.76s/it] 22%|██▏       | 116/536 [11:19<33:04,  4.72s/it] 22%|██▏       | 117/536 [11:22<28:43,  4.11s/it] 22%|██▏       | 118/536 [11:24<24:56,  3.58s/it] 22%|██▏       | 119/536 [11:26<22:09,  3.19s/it] 22%|██▏       | 120/536 [11:29<20:22,  2.94s/it]                                                 {'loss': '0.4727', 'grad_norm': '0.248', 'learning_rate': '9.546e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6422', 'tokens/total': 15728640, 'tokens/trainable': 14657396, 'epoch': '0.458'}
 22%|██▏       | 120/536 [11:29<20:22,  2.94s/it] 23%|██▎       | 121/536 [11:31<19:02,  2.75s/it] 23%|██▎       | 122/536 [11:33<18:01,  2.61s/it] 23%|██▎       | 123/536 [11:36<17:25,  2.53s/it] 23%|██▎       | 124/536 [11:38<17:02,  2.48s/it] 23%|██▎       | 125/536 [11:40<16:31,  2.41s/it]                                                 {'loss': '0.4808', 'grad_norm': '0.2344', 'learning_rate': '9.476e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6830', 'tokens/total': 16384000, 'tokens/trainable': 15266794, 'epoch': '0.4771'}
 23%|██▎       | 125/536 [11:40<16:31,  2.41s/it] 24%|██▎       | 126/536 [11:43<16:21,  2.39s/it] 24%|██▎       | 127/536 [11:45<16:21,  2.40s/it] 24%|██▍       | 128/536 [11:47<16:06,  2.37s/it] 24%|██▍       | 129/536 [11:50<15:59,  2.36s/it] 24%|██▍       | 130/536 [11:52<15:57,  2.36s/it]                                                 {'loss': '0.4726', 'grad_norm': '0.2451', 'learning_rate': '9.401e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 17039360, 'tokens/trainable': 15876387, 'epoch': '0.4962'}
 24%|██▍       | 130/536 [11:52<15:57,  2.36s/it] 24%|██▍       | 131/536 [11:54<15:48,  2.34s/it] 25%|██▍       | 132/536 [11:57<15:37,  2.32s/it] 25%|██▍       | 133/536 [11:59<15:31,  2.31s/it] 25%|██▌       | 134/536 [12:01<15:54,  2.37s/it] 25%|██▌       | 135/536 [12:04<16:09,  2.42s/it]                                                 {'loss': '0.4864', 'grad_norm': '0.2344', 'learning_rate': '9.322e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6056', 'tokens/total': 17694720, 'tokens/trainable': 16486440, 'epoch': '0.5153'}
 25%|██▌       | 135/536 [12:04<16:09,  2.42s/it][2026-03-16 19:23:38,988] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-135

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.41s/it][AWriting model shards: 100%|██████████| 1/1 [00:17<00:00, 17.41s/it]
 25%|██▌       | 136/536 [13:41<3:26:11, 30.93s/it] 26%|██▌       | 137/536 [13:45<2:30:52, 22.69s/it] 26%|██▌       | 138/536 [13:47<1:49:51, 16.56s/it] 26%|██▌       | 139/536 [13:49<1:21:09, 12.27s/it] 26%|██▌       | 140/536 [13:52<1:01:08,  9.26s/it]                                                   {'loss': '0.4817', 'grad_norm': '0.2275', 'learning_rate': '9.238e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6712', 'tokens/total': 18350080, 'tokens/trainable': 17095060, 'epoch': '0.5344'}
 26%|██▌       | 140/536 [13:52<1:01:08,  9.26s/it] 26%|██▋       | 141/536 [13:54<47:09,  7.16s/it]   26%|██▋       | 142/536 [13:56<37:27,  5.70s/it] 27%|██▋       | 143/536 [13:58<30:36,  4.67s/it] 27%|██▋       | 144/536 [14:01<25:57,  3.97s/it] 27%|██▋       | 145/536 [14:03<22:36,  3.47s/it]                                                 {'loss': '0.4827', 'grad_norm': '0.249', 'learning_rate': '9.149e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6633', 'tokens/total': 19005440, 'tokens/trainable': 17703368, 'epoch': '0.5534'}
 27%|██▋       | 145/536 [14:03<22:36,  3.47s/it] 27%|██▋       | 146/536 [14:05<20:19,  3.13s/it] 27%|██▋       | 147/536 [14:08<18:42,  2.89s/it] 28%|██▊       | 148/536 [14:10<17:44,  2.74s/it] 28%|██▊       | 149/536 [14:12<16:47,  2.60s/it] 28%|██▊       | 150/536 [14:15<16:15,  2.53s/it]                                                 {'loss': '0.4892', 'grad_norm': '0.2217', 'learning_rate': '9.057e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6466', 'tokens/total': 19660800, 'tokens/trainable': 18311084, 'epoch': '0.5725'}
 28%|██▊       | 150/536 [14:15<16:15,  2.53s/it] 28%|██▊       | 151/536 [14:17<15:49,  2.47s/it] 28%|██▊       | 152/536 [14:20<15:51,  2.48s/it] 29%|██▊       | 153/536 [14:22<16:18,  2.55s/it] 29%|██▊       | 154/536 [14:25<16:18,  2.56s/it] 29%|██▉       | 155/536 [14:27<16:04,  2.53s/it]                                                 {'loss': '0.4618', 'grad_norm': '0.2236', 'learning_rate': '8.959e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6104', 'tokens/total': 20316160, 'tokens/trainable': 18920000, 'epoch': '0.5916'}
 29%|██▉       | 155/536 [14:27<16:04,  2.53s/it] 29%|██▉       | 156/536 [14:30<15:36,  2.47s/it] 29%|██▉       | 157/536 [14:32<15:16,  2.42s/it] 29%|██▉       | 158/536 [14:35<15:24,  2.45s/it] 30%|██▉       | 159/536 [14:37<15:05,  2.40s/it] 30%|██▉       | 160/536 [14:39<14:54,  2.38s/it]                                                 {'loss': '0.471', 'grad_norm': '0.2793', 'learning_rate': '8.858e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6516', 'tokens/total': 20971520, 'tokens/trainable': 19529720, 'epoch': '0.6107'}
 30%|██▉       | 160/536 [14:39<14:54,  2.38s/it] 30%|███       | 161/536 [14:41<14:48,  2.37s/it] 30%|███       | 162/536 [14:44<14:28,  2.32s/it][2026-03-16 19:26:18,649] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-162

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.63s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.63s/it]
 30%|███       | 163/536 [16:21<3:11:06, 30.74s/it] 31%|███       | 164/536 [16:23<2:17:38, 22.20s/it] 31%|███       | 165/536 [16:25<1:40:18, 16.22s/it]                                                   {'loss': '0.4703', 'grad_norm': '0.2383', 'learning_rate': '8.752e-06', 'ppl': '1.6', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6785', 'tokens/total': 21626880, 'tokens/trainable': 20137712, 'epoch': '0.6298'}
 31%|███       | 165/536 [16:25<1:40:18, 16.22s/it] 31%|███       | 166/536 [16:28<1:14:13, 12.04s/it] 31%|███       | 167/536 [16:30<56:04,  9.12s/it]   31%|███▏      | 168/536 [16:32<43:22,  7.07s/it] 32%|███▏      | 169/536 [16:34<34:28,  5.64s/it] 32%|███▏      | 170/536 [16:37<28:13,  4.63s/it]                                                 {'loss': '0.4727', 'grad_norm': '0.2139', 'learning_rate': '8.643e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6694', 'tokens/total': 22282240, 'tokens/trainable': 20749040, 'epoch': '0.6489'}
 32%|███▏      | 170/536 [16:37<28:13,  4.63s/it] 32%|███▏      | 171/536 [16:39<24:07,  3.96s/it] 32%|███▏      | 172/536 [16:42<21:19,  3.51s/it] 32%|███▏      | 173/536 [16:44<19:34,  3.24s/it] 32%|███▏      | 174/536 [16:46<17:46,  2.95s/it] 33%|███▎      | 175/536 [16:49<16:24,  2.73s/it]                                                 {'loss': '0.4856', 'grad_norm': '0.2119', 'learning_rate': '8.53e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6812', 'tokens/total': 22937600, 'tokens/trainable': 21358216, 'epoch': '0.6679'}
 33%|███▎      | 175/536 [16:49<16:24,  2.73s/it] 33%|███▎      | 176/536 [16:51<15:32,  2.59s/it] 33%|███▎      | 177/536 [16:54<16:38,  2.78s/it] 33%|███▎      | 178/536 [16:57<15:52,  2.66s/it] 33%|███▎      | 179/536 [16:59<15:07,  2.54s/it] 34%|███▎      | 180/536 [17:01<14:47,  2.49s/it]                                                 {'loss': '0.4551', 'grad_norm': '0.2266', 'learning_rate': '8.413e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6375', 'tokens/total': 23592960, 'tokens/trainable': 21963408, 'epoch': '0.687'}
 34%|███▎      | 180/536 [17:01<14:47,  2.49s/it] 34%|███▍      | 181/536 [17:04<14:25,  2.44s/it] 34%|███▍      | 182/536 [17:06<14:08,  2.40s/it] 34%|███▍      | 183/536 [17:08<13:47,  2.34s/it] 34%|███▍      | 184/536 [17:10<13:42,  2.34s/it] 35%|███▍      | 185/536 [17:13<13:34,  2.32s/it]                                                 {'loss': '0.4654', 'grad_norm': '0.2695', 'learning_rate': '8.292e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6688', 'tokens/total': 24248320, 'tokens/trainable': 22570984, 'epoch': '0.7061'}
 35%|███▍      | 185/536 [17:13<13:34,  2.32s/it] 35%|███▍      | 186/536 [17:15<13:21,  2.29s/it] 35%|███▍      | 187/536 [17:17<13:19,  2.29s/it] 35%|███▌      | 188/536 [17:19<13:11,  2.28s/it] 35%|███▌      | 189/536 [17:22<13:08,  2.27s/it][2026-03-16 19:28:56,617] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-189

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.04s/it][AWriting model shards: 100%|██████████| 1/1 [00:17<00:00, 17.04s/it]
 35%|███▌      | 190/536 [19:01<3:00:49, 31.36s/it]                                                   {'loss': '0.4727', 'grad_norm': '0.2285', 'learning_rate': '8.168e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4795', 'tokens/total': 24903680, 'tokens/trainable': 23180680, 'epoch': '0.7252'}
 35%|███▌      | 190/536 [19:01<3:00:49, 31.36s/it] 36%|███▌      | 191/536 [19:03<2:10:15, 22.65s/it] 36%|███▌      | 192/536 [19:06<1:35:23, 16.64s/it] 36%|███▌      | 193/536 [19:08<1:11:08, 12.44s/it] 36%|███▌      | 194/536 [19:11<53:42,  9.42s/it]   36%|███▋      | 195/536 [19:13<41:27,  7.30s/it]                                                 {'loss': '0.462', 'grad_norm': '0.2158', 'learning_rate': '8.041e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6567', 'tokens/total': 25559040, 'tokens/trainable': 23790730, 'epoch': '0.7443'}
 36%|███▋      | 195/536 [19:13<41:27,  7.30s/it] 37%|███▋      | 196/536 [19:16<32:54,  5.81s/it] 37%|███▋      | 197/536 [19:18<27:50,  4.93s/it] 37%|███▋      | 198/536 [19:21<23:17,  4.13s/it] 37%|███▋      | 199/536 [19:23<20:04,  3.57s/it] 37%|███▋      | 200/536 [19:25<17:43,  3.17s/it]                                                 {'loss': '0.4676', 'grad_norm': '0.2188', 'learning_rate': '7.91e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6880', 'tokens/total': 26214400, 'tokens/trainable': 24401252, 'epoch': '0.7634'}
 37%|███▋      | 200/536 [19:25<17:43,  3.17s/it] 38%|███▊      | 201/536 [19:27<16:11,  2.90s/it] 38%|███▊      | 202/536 [19:30<15:05,  2.71s/it] 38%|███▊      | 203/536 [19:32<14:15,  2.57s/it] 38%|███▊      | 204/536 [19:34<13:41,  2.47s/it] 38%|███▊      | 205/536 [19:37<13:22,  2.43s/it]                                                 {'loss': '0.4504', 'grad_norm': '0.2158', 'learning_rate': '7.776e-06', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6617', 'tokens/total': 26869760, 'tokens/trainable': 25010696, 'epoch': '0.7824'}
 38%|███▊      | 205/536 [19:37<13:22,  2.43s/it] 38%|███▊      | 206/536 [19:39<13:08,  2.39s/it] 39%|███▊      | 207/536 [19:41<12:51,  2.34s/it] 39%|███▉      | 208/536 [19:43<12:41,  2.32s/it] 39%|███▉      | 209/536 [19:46<12:40,  2.32s/it] 39%|███▉      | 210/536 [19:48<13:00,  2.39s/it]                                                 {'loss': '0.4614', 'grad_norm': '0.2295', 'learning_rate': '7.639e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5994', 'tokens/total': 27525120, 'tokens/trainable': 25617872, 'epoch': '0.8015'}
 39%|███▉      | 210/536 [19:48<13:00,  2.39s/it] 39%|███▉      | 211/536 [19:51<13:14,  2.44s/it] 40%|███▉      | 212/536 [19:53<12:58,  2.40s/it] 40%|███▉      | 213/536 [19:55<12:42,  2.36s/it] 40%|███▉      | 214/536 [19:58<12:29,  2.33s/it] 40%|████      | 215/536 [20:00<12:22,  2.31s/it]                                                 {'loss': '0.477', 'grad_norm': '0.2412', 'learning_rate': '7.5e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6697', 'tokens/total': 28180480, 'tokens/trainable': 26227438, 'epoch': '0.8206'}
 40%|████      | 215/536 [20:00<12:22,  2.31s/it] 40%|████      | 216/536 [20:02<12:22,  2.32s/it][2026-03-16 19:31:37,309] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-216

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.85s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.85s/it]
 40%|████      | 217/536 [21:40<2:45:11, 31.07s/it] 41%|████      | 218/536 [21:43<1:59:15, 22.50s/it] 41%|████      | 219/536 [21:45<1:26:49, 16.43s/it] 41%|████      | 220/536 [21:47<1:04:08, 12.18s/it]                                                   {'loss': '0.4535', 'grad_norm': '0.2148', 'learning_rate': '7.358e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 28835840, 'tokens/trainable': 26833456, 'epoch': '0.8397'}
 41%|████      | 220/536 [21:47<1:04:08, 12.18s/it] 41%|████      | 221/536 [21:50<48:25,  9.22s/it]   41%|████▏     | 222/536 [21:52<37:25,  7.15s/it] 42%|████▏     | 223/536 [21:54<29:44,  5.70s/it] 42%|████▏     | 224/536 [21:57<24:17,  4.67s/it] 42%|████▏     | 225/536 [21:59<20:30,  3.96s/it]                                                 {'loss': '0.4639', 'grad_norm': '0.2197', 'learning_rate': '7.213e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 29491200, 'tokens/trainable': 27444416, 'epoch': '0.8588'}
 42%|████▏     | 225/536 [21:59<20:30,  3.96s/it] 42%|████▏     | 226/536 [22:01<17:50,  3.45s/it] 42%|████▏     | 227/536 [22:03<15:56,  3.10s/it] 43%|████▎     | 228/536 [22:06<14:40,  2.86s/it] 43%|████▎     | 229/536 [22:08<14:24,  2.82s/it] 43%|████▎     | 230/536 [22:11<14:01,  2.75s/it]                                                 {'loss': '0.4578', 'grad_norm': '0.2217', 'learning_rate': '7.066e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5816', 'tokens/total': 30146560, 'tokens/trainable': 28048432, 'epoch': '0.8779'}
 43%|████▎     | 230/536 [22:11<14:01,  2.75s/it] 43%|████▎     | 231/536 [22:13<13:21,  2.63s/it] 43%|████▎     | 232/536 [22:16<12:45,  2.52s/it] 43%|████▎     | 233/536 [22:18<12:29,  2.47s/it] 44%|████▎     | 234/536 [22:20<12:15,  2.44s/it] 44%|████▍     | 235/536 [22:23<11:59,  2.39s/it]                                                 {'loss': '0.4497', 'grad_norm': '0.2354', 'learning_rate': '6.917e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6580', 'tokens/total': 30801920, 'tokens/trainable': 28655952, 'epoch': '0.8969'}
 44%|████▍     | 235/536 [22:23<11:59,  2.39s/it] 44%|████▍     | 236/536 [22:25<11:55,  2.39s/it] 44%|████▍     | 237/536 [22:27<11:47,  2.37s/it] 44%|████▍     | 238/536 [22:30<11:38,  2.34s/it] 45%|████▍     | 239/536 [22:32<11:38,  2.35s/it] 45%|████▍     | 240/536 [22:34<11:23,  2.31s/it]                                                 {'loss': '0.4693', 'grad_norm': '0.2275', 'learning_rate': '6.766e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 31457280, 'tokens/trainable': 29262050, 'epoch': '0.916'}
 45%|████▍     | 240/536 [22:34<11:23,  2.31s/it] 45%|████▍     | 241/536 [22:37<11:19,  2.30s/it] 45%|████▌     | 242/536 [22:39<11:12,  2.29s/it] 45%|████▌     | 243/536 [22:41<11:10,  2.29s/it][2026-03-16 19:34:16,197] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-243

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.81s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it]
 46%|████▌     | 244/536 [24:21<2:33:31, 31.55s/it] 46%|████▌     | 245/536 [24:23<1:50:23, 22.76s/it]                                                   {'loss': '0.4629', 'grad_norm': '0.2178', 'learning_rate': '6.613e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 32112640, 'tokens/trainable': 29868356, 'epoch': '0.9351'}
 46%|████▌     | 245/536 [24:23<1:50:23, 22.76s/it] 46%|████▌     | 246/536 [24:25<1:20:19, 16.62s/it] 46%|████▌     | 247/536 [24:28<59:24, 12.33s/it]   46%|████▋     | 248/536 [24:30<45:07,  9.40s/it] 46%|████▋     | 249/536 [24:33<34:46,  7.27s/it] 47%|████▋     | 250/536 [24:35<27:30,  5.77s/it]                                                 {'loss': '0.474', 'grad_norm': '0.2539', 'learning_rate': '6.458e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6714', 'tokens/total': 32768000, 'tokens/trainable': 30473100, 'epoch': '0.9542'}
 47%|████▋     | 250/536 [24:35<27:30,  5.77s/it] 47%|████▋     | 251/536 [24:37<22:25,  4.72s/it] 47%|████▋     | 252/536 [24:39<18:50,  3.98s/it] 47%|████▋     | 253/536 [24:42<16:27,  3.49s/it] 47%|████▋     | 254/536 [24:44<14:46,  3.14s/it] 48%|████▊     | 255/536 [24:46<13:26,  2.87s/it]                                                 {'loss': '0.467', 'grad_norm': '0.2305', 'learning_rate': '6.302e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6772', 'tokens/total': 33423360, 'tokens/trainable': 31078478, 'epoch': '0.9733'}
 48%|████▊     | 255/536 [24:46<13:26,  2.87s/it] 48%|████▊     | 256/536 [24:49<12:32,  2.69s/it] 48%|████▊     | 257/536 [24:51<11:59,  2.58s/it] 48%|████▊     | 258/536 [24:53<11:39,  2.52s/it] 48%|████▊     | 259/536 [24:56<11:19,  2.45s/it] 49%|████▊     | 260/536 [24:58<11:02,  2.40s/it]                                                 {'loss': '0.4511', 'grad_norm': '0.2148', 'learning_rate': '6.144e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6619', 'tokens/total': 34078720, 'tokens/trainable': 31682612, 'epoch': '0.9924'}
 49%|████▊     | 260/536 [24:58<11:02,  2.40s/it] 49%|████▊     | 261/536 [25:00<10:50,  2.36s/it] 49%|████▉     | 262/536 [25:03<10:56,  2.40s/it] 49%|████▉     | 263/536 [25:06<12:20,  2.71s/it] 49%|████▉     | 264/536 [25:08<11:41,  2.58s/it] 49%|████▉     | 265/536 [25:11<11:22,  2.52s/it]                                                 {'loss': '0.4682', 'grad_norm': '0.2451', 'learning_rate': '5.985e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6472', 'tokens/total': 34734080, 'tokens/trainable': 32293470, 'epoch': '1.011'}
 49%|████▉     | 265/536 [25:11<11:22,  2.52s/it] 50%|████▉     | 266/536 [25:13<11:14,  2.50s/it] 50%|████▉     | 267/536 [25:16<11:28,  2.56s/it] 50%|█████     | 268/536 [25:18<11:16,  2.52s/it] 50%|█████     | 269/536 [25:21<10:53,  2.45s/it] 50%|█████     | 270/536 [25:23<10:39,  2.40s/it]                                                 {'loss': '0.461', 'grad_norm': '0.2207', 'learning_rate': '5.826e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6685', 'tokens/total': 35389440, 'tokens/trainable': 32904464, 'epoch': '1.031'}
 50%|█████     | 270/536 [25:23<10:39,  2.40s/it][2026-03-16 19:36:59,256] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-270

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.30s/it][AWriting model shards: 100%|██████████| 1/1 [00:17<00:00, 17.30s/it]
 51%|█████     | 271/536 [27:02<2:18:44, 31.41s/it] 51%|█████     | 272/536 [27:04<1:39:50, 22.69s/it] 51%|█████     | 273/536 [27:07<1:12:35, 16.56s/it] 51%|█████     | 274/536 [27:09<53:33, 12.26s/it]   51%|█████▏    | 275/536 [27:11<40:23,  9.29s/it]                                                 {'loss': '0.4545', 'grad_norm': '0.2324', 'learning_rate': '5.665e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6568', 'tokens/total': 36044800, 'tokens/trainable': 33517832, 'epoch': '1.05'}
 51%|█████▏    | 275/536 [27:11<40:23,  9.29s/it] 51%|█████▏    | 276/536 [27:15<32:37,  7.53s/it] 52%|█████▏    | 277/536 [27:17<25:45,  5.97s/it] 52%|█████▏    | 278/536 [27:19<20:59,  4.88s/it] 52%|█████▏    | 279/536 [27:22<17:35,  4.11s/it] 52%|█████▏    | 280/536 [27:24<15:12,  3.56s/it]                                                 {'loss': '0.447', 'grad_norm': '0.2158', 'learning_rate': '5.503e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 36700160, 'tokens/trainable': 34129632, 'epoch': '1.069'}
 52%|█████▏    | 280/536 [27:24<15:12,  3.56s/it] 52%|█████▏    | 281/536 [27:26<13:35,  3.20s/it] 53%|█████▎    | 282/536 [27:28<12:20,  2.91s/it] 53%|█████▎    | 283/536 [27:31<11:30,  2.73s/it] 53%|█████▎    | 284/536 [27:33<10:54,  2.60s/it] 53%|█████▎    | 285/536 [27:36<10:51,  2.60s/it]                                                 {'loss': '0.4378', 'grad_norm': '0.2119', 'learning_rate': '5.341e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5854', 'tokens/total': 37355520, 'tokens/trainable': 34742888, 'epoch': '1.088'}
 53%|█████▎    | 285/536 [27:36<10:51,  2.60s/it] 53%|█████▎    | 286/536 [27:38<10:42,  2.57s/it] 54%|█████▎    | 287/536 [27:41<10:26,  2.51s/it] 54%|█████▎    | 288/536 [27:43<10:02,  2.43s/it] 54%|█████▍    | 289/536 [27:45<09:45,  2.37s/it] 54%|█████▍    | 290/536 [27:47<09:40,  2.36s/it]                                                 {'loss': '0.4756', 'grad_norm': '0.2246', 'learning_rate': '5.179e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6534', 'tokens/total': 38010880, 'tokens/trainable': 35352848, 'epoch': '1.107'}
 54%|█████▍    | 290/536 [27:47<09:40,  2.36s/it] 54%|█████▍    | 291/536 [27:50<09:33,  2.34s/it] 54%|█████▍    | 292/536 [27:52<09:27,  2.33s/it] 55%|█████▍    | 293/536 [27:54<09:23,  2.32s/it] 55%|█████▍    | 294/536 [27:57<09:23,  2.33s/it] 55%|█████▌    | 295/536 [27:59<09:21,  2.33s/it]                                                 {'loss': '0.4635', 'grad_norm': '0.2188', 'learning_rate': '5.016e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6541', 'tokens/total': 38666240, 'tokens/trainable': 35964736, 'epoch': '1.126'}
 55%|█████▌    | 295/536 [27:59<09:21,  2.33s/it] 55%|█████▌    | 296/536 [28:01<09:16,  2.32s/it] 55%|█████▌    | 297/536 [28:03<09:09,  2.30s/it][2026-03-16 19:39:38,467] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-297

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it]
 56%|█████▌    | 298/536 [29:43<2:05:14, 31.57s/it] 56%|█████▌    | 299/536 [29:46<1:30:06, 22.81s/it] 56%|█████▌    | 300/536 [29:48<1:05:28, 16.65s/it]                                                   {'loss': '0.4578', 'grad_norm': '0.2334', 'learning_rate': '4.854e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6804', 'tokens/total': 39321600, 'tokens/trainable': 36579308, 'epoch': '1.145'}
 56%|█████▌    | 300/536 [29:48<1:05:28, 16.65s/it] 56%|█████▌    | 301/536 [29:50<48:19, 12.34s/it]   56%|█████▋    | 302/536 [29:53<36:18,  9.31s/it] 57%|█████▋    | 303/536 [29:55<27:59,  7.21s/it] 57%|█████▋    | 304/536 [29:57<22:22,  5.78s/it] 57%|█████▋    | 305/536 [30:00<18:50,  4.89s/it]                                                 {'loss': '0.4526', 'grad_norm': '0.2129', 'learning_rate': '4.691e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6214', 'tokens/total': 39976960, 'tokens/trainable': 37187912, 'epoch': '1.164'}
 57%|█████▋    | 305/536 [30:00<18:50,  4.89s/it] 57%|█████▋    | 306/536 [30:02<15:44,  4.11s/it] 57%|█████▋    | 307/536 [30:05<13:31,  3.55s/it] 57%|█████▋    | 308/536 [30:07<12:06,  3.19s/it] 58%|█████▊    | 309/536 [30:09<11:00,  2.91s/it] 58%|█████▊    | 310/536 [30:12<10:18,  2.74s/it]                                                 {'loss': '0.4482', 'grad_norm': '0.21', 'learning_rate': '4.529e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6526', 'tokens/total': 40632320, 'tokens/trainable': 37799984, 'epoch': '1.183'}
 58%|█████▊    | 310/536 [30:12<10:18,  2.74s/it] 58%|█████▊    | 311/536 [30:14<09:49,  2.62s/it] 58%|█████▊    | 312/536 [30:16<09:25,  2.53s/it] 58%|█████▊    | 313/536 [30:18<09:07,  2.45s/it] 59%|█████▊    | 314/536 [30:21<09:33,  2.58s/it] 59%|█████▉    | 315/536 [30:24<09:07,  2.48s/it]                                                 {'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '4.368e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6899', 'tokens/total': 41287680, 'tokens/trainable': 38409832, 'epoch': '1.202'}
 59%|█████▉    | 315/536 [30:24<09:07,  2.48s/it] 59%|█████▉    | 316/536 [30:26<08:55,  2.43s/it] 59%|█████▉    | 317/536 [30:28<08:44,  2.39s/it] 59%|█████▉    | 318/536 [30:31<08:35,  2.36s/it] 60%|█████▉    | 319/536 [30:33<08:25,  2.33s/it] 60%|█████▉    | 320/536 [30:35<08:19,  2.31s/it]                                                 {'loss': '0.4539', 'grad_norm': '0.2285', 'learning_rate': '4.207e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6757', 'tokens/total': 41943040, 'tokens/trainable': 39020096, 'epoch': '1.221'}
 60%|█████▉    | 320/536 [30:35<08:19,  2.31s/it] 60%|█████▉    | 321/536 [30:37<08:17,  2.31s/it] 60%|██████    | 322/536 [30:40<08:09,  2.29s/it] 60%|██████    | 323/536 [30:42<08:19,  2.35s/it] 60%|██████    | 324/536 [30:45<08:26,  2.39s/it][2026-03-16 19:42:20,100] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-324

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.49s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.49s/it]
 61%|██████    | 325/536 [32:22<1:48:54, 30.97s/it]                                                   {'loss': '0.4481', 'grad_norm': '0.2246', 'learning_rate': '4.046e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6887', 'tokens/total': 42598400, 'tokens/trainable': 39629160, 'epoch': '1.24'}
 61%|██████    | 325/536 [32:22<1:48:54, 30.97s/it] 61%|██████    | 326/536 [32:24<1:18:13, 22.35s/it] 61%|██████    | 327/536 [32:27<56:54, 16.34s/it]   61%|██████    | 328/536 [32:29<42:00, 12.12s/it] 61%|██████▏   | 329/536 [32:31<31:43,  9.20s/it] 62%|██████▏   | 330/536 [32:34<24:27,  7.12s/it]                                                 {'loss': '0.4542', 'grad_norm': '0.2256', 'learning_rate': '3.887e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6731', 'tokens/total': 43253760, 'tokens/trainable': 40237288, 'epoch': '1.26'}
 62%|██████▏   | 330/536 [32:34<24:27,  7.12s/it] 62%|██████▏   | 331/536 [32:36<19:21,  5.67s/it] 62%|██████▏   | 332/536 [32:39<16:57,  4.99s/it] 62%|██████▏   | 333/536 [32:42<14:06,  4.17s/it] 62%|██████▏   | 334/536 [32:44<12:07,  3.60s/it] 62%|██████▎   | 335/536 [32:46<10:43,  3.20s/it]                                                 {'loss': '0.4412', 'grad_norm': '0.2539', 'learning_rate': '3.729e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6745', 'tokens/total': 43909120, 'tokens/trainable': 40848032, 'epoch': '1.279'}
 62%|██████▎   | 335/536 [32:46<10:43,  3.20s/it] 63%|██████▎   | 336/536 [32:48<09:46,  2.93s/it] 63%|██████▎   | 337/536 [32:51<09:05,  2.74s/it] 63%|██████▎   | 338/536 [32:53<08:37,  2.61s/it] 63%|██████▎   | 339/536 [32:55<08:14,  2.51s/it] 63%|██████▎   | 340/536 [32:58<08:04,  2.47s/it]                                                 {'loss': '0.4615', 'grad_norm': '0.2217', 'learning_rate': '3.573e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6370', 'tokens/total': 44564480, 'tokens/trainable': 41457624, 'epoch': '1.298'}
 63%|██████▎   | 340/536 [32:58<08:04,  2.47s/it] 64%|██████▎   | 341/536 [33:00<08:05,  2.49s/it] 64%|██████▍   | 342/536 [33:03<07:51,  2.43s/it] 64%|██████▍   | 343/536 [33:05<07:49,  2.43s/it] 64%|██████▍   | 344/536 [33:08<07:51,  2.46s/it] 64%|██████▍   | 345/536 [33:10<07:38,  2.40s/it]                                                 {'loss': '0.4599', 'grad_norm': '0.2188', 'learning_rate': '3.418e-06', 'ppl': '1.584', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6795', 'tokens/total': 45219840, 'tokens/trainable': 42069272, 'epoch': '1.317'}
 64%|██████▍   | 345/536 [33:10<07:38,  2.40s/it] 65%|██████▍   | 346/536 [33:12<07:29,  2.37s/it] 65%|██████▍   | 347/536 [33:14<07:26,  2.36s/it] 65%|██████▍   | 348/536 [33:17<07:20,  2.34s/it] 65%|██████▌   | 349/536 [33:19<07:12,  2.32s/it] 65%|██████▌   | 350/536 [33:21<07:09,  2.31s/it]                                                 {'loss': '0.4499', 'grad_norm': '0.2148', 'learning_rate': '3.264e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6691', 'tokens/total': 45875200, 'tokens/trainable': 42681132, 'epoch': '1.336'}
 65%|██████▌   | 350/536 [33:21<07:09,  2.31s/it] 65%|██████▌   | 351/536 [33:24<07:04,  2.29s/it][2026-03-16 19:44:58,459] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-351

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.55s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.55s/it]
 66%|██████▌   | 352/536 [35:02<1:35:17, 31.07s/it] 66%|██████▌   | 353/536 [35:04<1:08:31, 22.47s/it] 66%|██████▌   | 354/536 [35:06<49:47, 16.42s/it]   66%|██████▌   | 355/536 [35:09<36:49, 12.21s/it]                                                 {'loss': '0.4529', 'grad_norm': '0.249', 'learning_rate': '3.113e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6371', 'tokens/total': 46530560, 'tokens/trainable': 43292904, 'epoch': '1.355'}
 66%|██████▌   | 355/536 [35:09<36:49, 12.21s/it] 66%|██████▋   | 356/536 [35:11<27:37,  9.21s/it] 67%|██████▋   | 357/536 [35:13<21:14,  7.12s/it] 67%|██████▋   | 358/536 [35:16<16:48,  5.67s/it] 67%|██████▋   | 359/536 [35:18<13:56,  4.73s/it] 67%|██████▋   | 360/536 [35:21<11:51,  4.04s/it]                                                 {'loss': '0.4461', 'grad_norm': '0.2207', 'learning_rate': '2.963e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6150', 'tokens/total': 47185920, 'tokens/trainable': 43900272, 'epoch': '1.374'}
 67%|██████▋   | 360/536 [35:21<11:51,  4.04s/it] 67%|██████▋   | 361/536 [35:23<10:14,  3.51s/it] 68%|██████▊   | 362/536 [35:25<09:24,  3.25s/it] 68%|██████▊   | 363/536 [35:28<08:33,  2.97s/it] 68%|██████▊   | 364/536 [35:30<07:52,  2.75s/it] 68%|██████▊   | 365/536 [35:32<07:23,  2.59s/it]                                                 {'loss': '0.4581', 'grad_norm': '0.3555', 'learning_rate': '2.816e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6767', 'tokens/total': 47841280, 'tokens/trainable': 44509872, 'epoch': '1.393'}
 68%|██████▊   | 365/536 [35:32<07:23,  2.59s/it] 68%|██████▊   | 366/536 [35:34<07:04,  2.50s/it] 68%|██████▊   | 367/536 [35:37<06:49,  2.42s/it] 69%|██████▊   | 368/536 [35:39<06:39,  2.38s/it] 69%|██████▉   | 369/536 [35:41<06:33,  2.36s/it] 69%|██████▉   | 370/536 [35:44<06:33,  2.37s/it]                                                 {'loss': '0.4483', 'grad_norm': '0.2109', 'learning_rate': '2.671e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6359', 'tokens/total': 48496640, 'tokens/trainable': 45121444, 'epoch': '1.412'}
 69%|██████▉   | 370/536 [35:44<06:33,  2.37s/it] 69%|██████▉   | 371/536 [35:46<06:27,  2.35s/it] 69%|██████▉   | 372/536 [35:48<06:25,  2.35s/it] 70%|██████▉   | 373/536 [35:51<06:17,  2.31s/it] 70%|██████▉   | 374/536 [35:53<06:12,  2.30s/it] 70%|██████▉   | 375/536 [35:55<06:13,  2.32s/it]                                                 {'loss': '0.4475', 'grad_norm': '0.2617', 'learning_rate': '2.528e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6505', 'tokens/total': 49152000, 'tokens/trainable': 45733296, 'epoch': '1.431'}
 70%|██████▉   | 375/536 [35:55<06:13,  2.32s/it] 70%|███████   | 376/536 [35:58<06:09,  2.31s/it] 70%|███████   | 377/536 [36:00<06:26,  2.43s/it] 71%|███████   | 378/536 [36:03<06:17,  2.39s/it][2026-03-16 19:47:37,290] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-378

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.77s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.77s/it]
 71%|███████   | 379/536 [37:39<1:20:25, 30.73s/it] 71%|███████   | 380/536 [37:42<57:44, 22.21s/it]                                                   {'loss': '0.4467', 'grad_norm': '0.208', 'learning_rate': '2.388e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6639', 'tokens/total': 49807360, 'tokens/trainable': 46341868, 'epoch': '1.45'}
 71%|███████   | 380/536 [37:42<57:44, 22.21s/it] 71%|███████   | 381/536 [37:44<41:54, 16.22s/it] 71%|███████▏  | 382/536 [37:46<31:01, 12.09s/it] 71%|███████▏  | 383/536 [37:49<23:20,  9.16s/it] 72%|███████▏  | 384/536 [37:51<17:57,  7.09s/it] 72%|███████▏  | 385/536 [37:53<14:15,  5.66s/it]                                                 {'loss': '0.4373', 'grad_norm': '0.2129', 'learning_rate': '2.251e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6487', 'tokens/total': 50462720, 'tokens/trainable': 46948144, 'epoch': '1.469'}
 72%|███████▏  | 385/536 [37:53<14:15,  5.66s/it] 72%|███████▏  | 386/536 [37:56<11:39,  4.67s/it] 72%|███████▏  | 387/536 [37:58<09:46,  3.94s/it] 72%|███████▏  | 388/536 [38:00<08:31,  3.45s/it] 73%|███████▎  | 389/536 [38:03<07:40,  3.14s/it] 73%|███████▎  | 390/536 [38:05<07:00,  2.88s/it]                                                 {'loss': '0.452', 'grad_norm': '0.2314', 'learning_rate': '2.117e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 51118080, 'tokens/trainable': 47558056, 'epoch': '1.489'}
 73%|███████▎  | 390/536 [38:05<07:00,  2.88s/it] 73%|███████▎  | 391/536 [38:08<07:22,  3.05s/it] 73%|███████▎  | 392/536 [38:11<06:49,  2.85s/it] 73%|███████▎  | 393/536 [38:13<06:22,  2.68s/it] 74%|███████▎  | 394/536 [38:15<06:02,  2.55s/it] 74%|███████▎  | 395/536 [38:18<06:01,  2.56s/it]                                                 {'loss': '0.4435', 'grad_norm': '0.2139', 'learning_rate': '1.985e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5877', 'tokens/total': 51773440, 'tokens/trainable': 48168520, 'epoch': '1.508'}
 74%|███████▎  | 395/536 [38:18<06:01,  2.56s/it] 74%|███████▍  | 396/536 [38:20<05:47,  2.48s/it] 74%|███████▍  | 397/536 [38:22<05:35,  2.42s/it] 74%|███████▍  | 398/536 [38:25<05:27,  2.37s/it] 74%|███████▍  | 399/536 [38:27<05:22,  2.35s/it] 75%|███████▍  | 400/536 [38:29<05:21,  2.37s/it]                                                 {'loss': '0.4444', 'grad_norm': '0.2236', 'learning_rate': '1.857e-06', 'ppl': '1.56', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6406', 'tokens/total': 52428800, 'tokens/trainable': 48779416, 'epoch': '1.527'}
 75%|███████▍  | 400/536 [38:29<05:21,  2.37s/it] 75%|███████▍  | 401/536 [38:32<05:15,  2.34s/it] 75%|███████▌  | 402/536 [38:34<05:10,  2.32s/it] 75%|███████▌  | 403/536 [38:36<05:10,  2.34s/it] 75%|███████▌  | 404/536 [38:39<05:07,  2.33s/it] 76%|███████▌  | 405/536 [38:41<05:03,  2.32s/it]                                                 {'loss': '0.4557', 'grad_norm': '0.2324', 'learning_rate': '1.732e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6719', 'tokens/total': 53084160, 'tokens/trainable': 49387032, 'epoch': '1.546'}
 76%|███████▌  | 405/536 [38:41<05:03,  2.32s/it][2026-03-16 19:50:15,755] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-405

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.46s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.46s/it]
 76%|███████▌  | 406/536 [40:19<1:06:58, 30.91s/it] 76%|███████▌  | 407/536 [40:21<48:00, 22.33s/it]   76%|███████▌  | 408/536 [40:23<34:49, 16.33s/it] 76%|███████▋  | 409/536 [40:25<25:39, 12.12s/it] 76%|███████▋  | 410/536 [40:28<19:38,  9.36s/it]                                                 {'loss': '0.4617', 'grad_norm': '0.2168', 'learning_rate': '1.611e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5250', 'tokens/total': 53739520, 'tokens/trainable': 49995344, 'epoch': '1.565'}
 76%|███████▋  | 410/536 [40:28<19:38,  9.36s/it] 77%|███████▋  | 411/536 [40:31<15:03,  7.23s/it] 77%|███████▋  | 412/536 [40:33<11:54,  5.76s/it] 77%|███████▋  | 413/536 [40:36<09:53,  4.83s/it] 77%|███████▋  | 414/536 [40:38<08:16,  4.07s/it] 77%|███████▋  | 415/536 [40:40<07:07,  3.53s/it]                                                 {'loss': '0.4492', 'grad_norm': '0.2217', 'learning_rate': '1.493e-06', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6671', 'tokens/total': 54394880, 'tokens/trainable': 50603264, 'epoch': '1.584'}
 77%|███████▋  | 415/536 [40:40<07:07,  3.53s/it] 78%|███████▊  | 416/536 [40:43<06:19,  3.17s/it] 78%|███████▊  | 417/536 [40:45<05:45,  2.90s/it] 78%|███████▊  | 418/536 [40:47<05:26,  2.77s/it] 78%|███████▊  | 419/536 [40:50<05:18,  2.73s/it] 78%|███████▊  | 420/536 [40:52<05:02,  2.61s/it]                                                 {'loss': '0.4522', 'grad_norm': '0.2676', 'learning_rate': '1.379e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 55050240, 'tokens/trainable': 51213612, 'epoch': '1.603'}
 78%|███████▊  | 420/536 [40:52<05:02,  2.61s/it] 79%|███████▊  | 421/536 [40:55<04:49,  2.52s/it] 79%|███████▊  | 422/536 [40:57<04:40,  2.46s/it] 79%|███████▉  | 423/536 [40:59<04:40,  2.49s/it] 79%|███████▉  | 424/536 [41:02<04:29,  2.41s/it] 79%|███████▉  | 425/536 [41:04<04:21,  2.36s/it]                                                 {'loss': '0.4414', 'grad_norm': '0.2168', 'learning_rate': '1.269e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6755', 'tokens/total': 55705600, 'tokens/trainable': 51819592, 'epoch': '1.622'}
 79%|███████▉  | 425/536 [41:04<04:21,  2.36s/it] 79%|███████▉  | 426/536 [41:06<04:17,  2.34s/it] 80%|███████▉  | 427/536 [41:08<04:13,  2.32s/it] 80%|███████▉  | 428/536 [41:11<04:08,  2.30s/it] 80%|████████  | 429/536 [41:13<04:07,  2.31s/it] 80%|████████  | 430/536 [41:15<04:04,  2.31s/it]                                                 {'loss': '0.4532', 'grad_norm': '0.2217', 'learning_rate': '1.163e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6642', 'tokens/total': 56360960, 'tokens/trainable': 52431520, 'epoch': '1.641'}
 80%|████████  | 430/536 [41:15<04:04,  2.31s/it] 80%|████████  | 431/536 [41:18<04:10,  2.38s/it] 81%|████████  | 432/536 [41:20<04:04,  2.35s/it][2026-03-16 19:52:55,057] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-432

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.45s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.45s/it]
 81%|████████  | 433/536 [42:58<53:14, 31.01s/it] 81%|████████  | 434/536 [43:00<38:05, 22.41s/it] 81%|████████  | 435/536 [43:03<27:33, 16.37s/it]                                                 {'loss': '0.4605', 'grad_norm': '0.3574', 'learning_rate': '1.061e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 57016320, 'tokens/trainable': 53041944, 'epoch': '1.66'}
 81%|████████  | 435/536 [43:03<27:33, 16.37s/it] 81%|████████▏ | 436/536 [43:05<20:20, 12.20s/it] 82%|████████▏ | 437/536 [43:07<15:11,  9.20s/it] 82%|████████▏ | 438/536 [43:10<11:41,  7.16s/it] 82%|████████▏ | 439/536 [43:12<09:11,  5.68s/it] 82%|████████▏ | 440/536 [43:14<07:29,  4.68s/it]                                                 {'loss': '0.446', 'grad_norm': '0.2119', 'learning_rate': '9.626e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6513', 'tokens/total': 57671680, 'tokens/trainable': 53647180, 'epoch': '1.679'}
 82%|████████▏ | 440/536 [43:14<07:29,  4.68s/it] 82%|████████▏ | 441/536 [43:17<06:23,  4.04s/it] 82%|████████▏ | 442/536 [43:19<05:31,  3.53s/it] 83%|████████▎ | 443/536 [43:21<04:53,  3.16s/it] 83%|████████▎ | 444/536 [43:24<04:26,  2.90s/it] 83%|████████▎ | 445/536 [43:26<04:04,  2.69s/it]                                                 {'loss': '0.4299', 'grad_norm': '0.2188', 'learning_rate': '8.688e-07', 'ppl': '1.537', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6951', 'tokens/total': 58327040, 'tokens/trainable': 54256432, 'epoch': '1.698'}
 83%|████████▎ | 445/536 [43:26<04:04,  2.69s/it] 83%|████████▎ | 446/536 [43:28<03:52,  2.58s/it] 83%|████████▎ | 447/536 [43:31<03:41,  2.49s/it] 84%|████████▎ | 448/536 [43:33<03:32,  2.41s/it] 84%|████████▍ | 449/536 [43:35<03:30,  2.42s/it] 84%|████████▍ | 450/536 [43:38<03:23,  2.37s/it]                                                 {'loss': '0.4583', 'grad_norm': '0.2188', 'learning_rate': '7.794e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6777', 'tokens/total': 58982400, 'tokens/trainable': 54863148, 'epoch': '1.718'}
 84%|████████▍ | 450/536 [43:38<03:23,  2.37s/it] 84%|████████▍ | 451/536 [43:40<03:28,  2.46s/it] 84%|████████▍ | 452/536 [43:43<03:24,  2.44s/it] 85%|████████▍ | 453/536 [43:45<03:19,  2.40s/it] 85%|████████▍ | 454/536 [43:47<03:15,  2.38s/it] 85%|████████▍ | 455/536 [43:50<03:12,  2.38s/it]                                                 {'loss': '0.4523', 'grad_norm': '0.2119', 'learning_rate': '6.945e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6454', 'tokens/total': 59637760, 'tokens/trainable': 55471580, 'epoch': '1.737'}
 85%|████████▍ | 455/536 [43:50<03:12,  2.38s/it] 85%|████████▌ | 456/536 [43:52<03:15,  2.44s/it] 85%|████████▌ | 457/536 [43:54<03:10,  2.41s/it] 85%|████████▌ | 458/536 [43:57<03:11,  2.46s/it] 86%|████████▌ | 459/536 [43:59<03:05,  2.41s/it][2026-03-16 19:55:34,637] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-459

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.63s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.63s/it]
 86%|████████▌ | 460/536 [45:37<39:11, 30.94s/it]                                                 {'loss': '0.4523', 'grad_norm': '0.2148', 'learning_rate': '6.141e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6711', 'tokens/total': 60293120, 'tokens/trainable': 56081832, 'epoch': '1.756'}
 86%|████████▌ | 460/536 [45:37<39:11, 30.94s/it] 86%|████████▌ | 461/536 [45:39<27:55, 22.34s/it] 86%|████████▌ | 462/536 [45:41<20:06, 16.30s/it] 86%|████████▋ | 463/536 [45:44<14:42, 12.09s/it] 87%|████████▋ | 464/536 [45:46<10:58,  9.14s/it] 87%|████████▋ | 465/536 [45:48<08:21,  7.06s/it]                                                 {'loss': '0.4461', 'grad_norm': '0.4551', 'learning_rate': '5.383e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6903', 'tokens/total': 60948480, 'tokens/trainable': 56694240, 'epoch': '1.775'}
 87%|████████▋ | 465/536 [45:48<08:21,  7.06s/it] 87%|████████▋ | 466/536 [45:50<06:33,  5.62s/it] 87%|████████▋ | 467/536 [45:53<05:23,  4.69s/it] 87%|████████▋ | 468/536 [45:55<04:29,  3.97s/it] 88%|████████▊ | 469/536 [45:57<03:50,  3.44s/it] 88%|████████▊ | 470/536 [46:01<03:45,  3.42s/it]                                                 {'loss': '0.4341', 'grad_norm': '0.208', 'learning_rate': '4.673e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4512', 'tokens/total': 61603840, 'tokens/trainable': 57301404, 'epoch': '1.794'}
 88%|████████▊ | 470/536 [46:01<03:45,  3.42s/it] 88%|████████▊ | 471/536 [46:03<03:20,  3.09s/it] 88%|████████▊ | 472/536 [46:05<03:02,  2.86s/it] 88%|████████▊ | 473/536 [46:08<02:50,  2.70s/it] 88%|████████▊ | 474/536 [46:10<02:39,  2.58s/it] 89%|████████▊ | 475/536 [46:12<02:34,  2.54s/it]                                                 {'loss': '0.4627', 'grad_norm': '0.2461', 'learning_rate': '4.011e-07', 'ppl': '1.588', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6249', 'tokens/total': 62259200, 'tokens/trainable': 57910216, 'epoch': '1.813'}
 89%|████████▊ | 475/536 [46:12<02:34,  2.54s/it] 89%|████████▉ | 476/536 [46:15<02:26,  2.45s/it] 89%|████████▉ | 477/536 [46:17<02:24,  2.45s/it] 89%|████████▉ | 478/536 [46:20<02:25,  2.51s/it] 89%|████████▉ | 479/536 [46:22<02:19,  2.45s/it] 90%|████████▉ | 480/536 [46:24<02:13,  2.39s/it]                                                 {'loss': '0.4538', 'grad_norm': '0.2178', 'learning_rate': '3.397e-07', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6635', 'tokens/total': 62914560, 'tokens/trainable': 58517712, 'epoch': '1.832'}
 90%|████████▉ | 480/536 [46:24<02:13,  2.39s/it] 90%|████████▉ | 481/536 [46:27<02:09,  2.36s/it] 90%|████████▉ | 482/536 [46:29<02:05,  2.33s/it] 90%|█████████ | 483/536 [46:31<02:03,  2.33s/it] 90%|█████████ | 484/536 [46:34<02:01,  2.33s/it] 90%|█████████ | 485/536 [46:36<02:04,  2.44s/it]                                                 {'loss': '0.4395', 'grad_norm': '0.208', 'learning_rate': '2.833e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6504', 'tokens/total': 63569920, 'tokens/trainable': 59125608, 'epoch': '1.851'}
 90%|█████████ | 485/536 [46:36<02:04,  2.44s/it] 91%|█████████ | 486/536 [46:39<01:59,  2.39s/it][2026-03-16 19:58:13,418] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-486

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it]
 91%|█████████ | 487/536 [48:16<25:09, 30.81s/it] 91%|█████████ | 488/536 [48:18<17:48, 22.26s/it] 91%|█████████ | 489/536 [48:20<12:44, 16.26s/it] 91%|█████████▏| 490/536 [48:24<09:32, 12.45s/it]                                                 {'loss': '0.4478', 'grad_norm': '0.2236', 'learning_rate': '2.318e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4290', 'tokens/total': 64225280, 'tokens/trainable': 59733412, 'epoch': '1.87'}
 91%|█████████▏| 490/536 [48:24<09:32, 12.45s/it] 92%|█████████▏| 491/536 [48:26<07:02,  9.39s/it] 92%|█████████▏| 492/536 [48:28<05:19,  7.26s/it] 92%|█████████▏| 493/536 [48:31<04:08,  5.78s/it] 92%|█████████▏| 494/536 [48:33<03:22,  4.82s/it] 92%|█████████▏| 495/536 [48:36<02:47,  4.08s/it]                                                 {'loss': '0.4362', 'grad_norm': '0.2129', 'learning_rate': '1.854e-07', 'ppl': '1.547', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6524', 'tokens/total': 64880640, 'tokens/trainable': 60339904, 'epoch': '1.889'}
 92%|█████████▏| 495/536 [48:36<02:47,  4.08s/it] 93%|█████████▎| 496/536 [48:38<02:25,  3.64s/it] 93%|█████████▎| 497/536 [48:40<02:06,  3.23s/it] 93%|█████████▎| 498/536 [48:43<01:52,  2.96s/it] 93%|█████████▎| 499/536 [48:45<01:42,  2.76s/it] 93%|█████████▎| 500/536 [48:47<01:34,  2.62s/it]                                                 {'loss': '0.4656', 'grad_norm': '0.2217', 'learning_rate': '1.441e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6649', 'tokens/total': 65536000, 'tokens/trainable': 60945412, 'epoch': '1.908'}
 93%|█████████▎| 500/536 [48:47<01:34,  2.62s/it] 93%|█████████▎| 501/536 [48:50<01:29,  2.55s/it] 94%|█████████▎| 502/536 [48:52<01:25,  2.52s/it] 94%|█████████▍| 503/536 [48:54<01:20,  2.44s/it] 94%|█████████▍| 504/536 [48:57<01:16,  2.39s/it] 94%|█████████▍| 505/536 [48:59<01:13,  2.36s/it]                                                 {'loss': '0.4466', 'grad_norm': '0.2129', 'learning_rate': '1.079e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6658', 'tokens/total': 66191360, 'tokens/trainable': 61550936, 'epoch': '1.927'}
 94%|█████████▍| 505/536 [48:59<01:13,  2.36s/it] 94%|█████████▍| 506/536 [49:01<01:09,  2.33s/it] 95%|█████████▍| 507/536 [49:04<01:07,  2.31s/it] 95%|█████████▍| 508/536 [49:06<01:04,  2.31s/it] 95%|█████████▍| 509/536 [49:08<01:04,  2.37s/it] 95%|█████████▌| 510/536 [49:11<01:01,  2.35s/it]                                                 {'loss': '0.4754', 'grad_norm': '0.2168', 'learning_rate': '7.691e-08', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 66846720, 'tokens/trainable': 62157088, 'epoch': '1.947'}
 95%|█████████▌| 510/536 [49:11<01:01,  2.35s/it] 95%|█████████▌| 511/536 [49:13<00:58,  2.34s/it] 96%|█████████▌| 512/536 [49:15<00:55,  2.32s/it] 96%|█████████▌| 513/536 [49:18<00:53,  2.31s/it][2026-03-16 20:00:52,618] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-513

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.79s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.79s/it]
 96%|█████████▌| 514/536 [50:57<11:33, 31.50s/it] 96%|█████████▌| 515/536 [51:00<07:58, 22.77s/it]                                                 {'loss': '0.4548', 'grad_norm': '0.2217', 'learning_rate': '5.11e-08', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6342', 'tokens/total': 67502080, 'tokens/trainable': 62762592, 'epoch': '1.966'}
 96%|█████████▌| 515/536 [51:00<07:58, 22.77s/it] 96%|█████████▋| 516/536 [51:02<05:32, 16.65s/it] 96%|█████████▋| 517/536 [51:04<03:54, 12.32s/it] 97%|█████████▋| 518/536 [51:06<02:47,  9.30s/it] 97%|█████████▋| 519/536 [51:09<02:02,  7.18s/it] 97%|█████████▋| 520/536 [51:11<01:32,  5.76s/it]                                                 {'loss': '0.4544', 'grad_norm': '0.2314', 'learning_rate': '3.054e-08', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6125', 'tokens/total': 68157440, 'tokens/trainable': 63366800, 'epoch': '1.985'}
 97%|█████████▋| 520/536 [51:11<01:32,  5.76s/it] 97%|█████████▋| 521/536 [51:13<01:10,  4.72s/it] 97%|█████████▋| 522/536 [51:16<00:55,  3.98s/it] 98%|█████████▊| 523/536 [51:18<00:45,  3.47s/it] 98%|█████████▊| 524/536 [51:20<00:38,  3.19s/it] 98%|█████████▊| 525/536 [51:24<00:35,  3.21s/it]                                                 {'loss': '0.4504', 'grad_norm': '0.2246', 'learning_rate': '1.522e-08', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6753', 'tokens/total': 68812800, 'tokens/trainable': 63975056, 'epoch': '2.004'}
 98%|█████████▊| 525/536 [51:24<00:35,  3.21s/it] 98%|█████████▊| 526/536 [51:26<00:29,  2.92s/it] 98%|█████████▊| 527/536 [51:28<00:24,  2.75s/it] 99%|█████████▊| 528/536 [51:31<00:21,  2.71s/it] 99%|█████████▊| 529/536 [51:33<00:18,  2.60s/it] 99%|█████████▉| 530/536 [51:36<00:15,  2.52s/it]                                                 {'loss': '0.4546', 'grad_norm': '0.2109', 'learning_rate': '5.182e-09', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6575', 'tokens/total': 69468160, 'tokens/trainable': 64586448, 'epoch': '2.023'}
 99%|█████████▉| 530/536 [51:36<00:15,  2.52s/it] 99%|█████████▉| 531/536 [51:38<00:12,  2.45s/it] 99%|█████████▉| 532/536 [51:40<00:09,  2.40s/it] 99%|█████████▉| 533/536 [51:43<00:07,  2.40s/it]100%|█████████▉| 534/536 [51:45<00:04,  2.39s/it]100%|█████████▉| 535/536 [51:47<00:02,  2.42s/it]                                                 {'loss': '0.4493', 'grad_norm': '0.2314', 'learning_rate': '4.231e-10', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6115', 'tokens/total': 70123520, 'tokens/trainable': 65199364, 'epoch': '2.042'}
100%|█████████▉| 535/536 [51:47<00:02,  2.42s/it]100%|██████████| 536/536 [51:50<00:00,  2.37s/it][2026-03-16 20:03:25,941] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-536

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|██████████| 1/1 [00:16<00:00, 16.68s/it][AWriting model shards: 100%|██████████| 1/1 [00:16<00:00, 16.68s/it]
                                                 {'train_runtime': '3210', 'train_samples_per_second': '2.672', 'train_steps_per_second': '0.167', 'train_loss': '0.4897', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'epoch': '2.046', 'tokens/train_per_sec_per_gpu': '6757'}
100%|██████████| 536/536 [53:26<00:00,  2.37s/it]100%|██████████| 536/536 [53:26<00:00,  5.98s/it]
[2026-03-16 20:04:52,263] [INFO] [axolotl.train.save_trained_model:237] [PID:213] Training completed! Saving trained model to ./outputs/qwen3-sft-stmt-tk/.
[2026-03-16 20:05:01,009] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/
Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.53s/it]Writing model shards: 100%|██████████| 1/1 [00:17<00:00, 17.53s/it]
[2026-03-16 20:05:19,091] [INFO] [axolotl.train.save_trained_model:351] [PID:213] Model successfully saved to ./outputs/qwen3-sft-stmt-tk/