File size: 111,686 Bytes
5270841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
[2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB ()
[2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "qwen3-sft-stmt-tk.yml",
  "base_model": "Qwen/Qwen3-8B",
  "base_model_config": "Qwen/Qwen3-8B",
  "batch_size": 16,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 8,
    "n_node": 1
  },
  "chat_template": "qwen3",
  "chat_template_kwargs": {
    "enable_thinking": false
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 8,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 192,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "xiaolesu/lean4-sft-stmt-tk",
      "split": "train",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "evals_per_epoch": 10,
  "experimental_skip_move_to_device": true,
  "flex_attention": true,
  "flex_attn_compile_kwargs": {
    "dynamic": false,
    "mode": "max-autotune-no-cudagraphs"
  },
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "activation_checkpointing": true,
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "fsdp_version": 2,
    "offload_params": false,
    "reshard_after_forward": true,
    "state_dict_type": "FULL_STATE_DICT",
    "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
  },
  "fsdp_version": 2,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "include_tkps": true,
  "learning_rate": 1e-05,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 5,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "qwen3",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/qwen3-sft-stmt-tk/",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.05,
  "save_total_limit": 3,
  "saves_per_epoch": 10,
  "sequence_len": 8192,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3-8B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "qwen3-8b-tk-run1",
  "wandb_project": "qwen3-sft-stmt-tk",
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 8
}
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11
[2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets...
[2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.

Fetching 0 files: 0it [00:00, ?it/s]
Fetching 0 files: 0it [00:00, ?it/s]
[2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205
[2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159

Dropping Invalid Sequences (<None or >8192) (num_proc=192):   0%|          | 0/11192 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):   1%|          | 59/11192 [00:02<06:34, 28.25 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):   3%|β–Ž         | 295/11192 [00:02<01:02, 175.65 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):   6%|β–Œ         | 649/11192 [00:02<00:23, 453.06 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):   8%|β–Š         | 885/11192 [00:02<00:16, 634.46 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  10%|β–ˆ         | 1121/11192 [00:02<00:11, 849.04 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  13%|β–ˆβ–Ž        | 1416/11192 [00:02<00:08, 1166.00 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  15%|β–ˆβ–Œ        | 1711/11192 [00:02<00:06, 1480.17 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  18%|β–ˆβ–Š        | 2006/11192 [00:02<00:05, 1697.58 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  21%|β–ˆβ–ˆ        | 2301/11192 [00:02<00:04, 1949.74 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  23%|β–ˆβ–ˆβ–Ž       | 2596/11192 [00:03<00:04, 2145.10 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  26%|β–ˆβ–ˆβ–Œ       | 2891/11192 [00:03<00:03, 2324.57 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  29%|β–ˆβ–ˆβ–‰       | 3245/11192 [00:03<00:03, 2566.75 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192):  70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 7828/11192 [00:03<00:00, 14035.00 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 11192/11192 [00:04<00:00, 2753.84 examples/s]
[2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192])

Drop Samples with Zero Trainable Tokens (num_proc=192):   0%|          | 0/10830 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):   1%|          | 57/10830 [00:02<06:27, 27.78 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):   3%|β–Ž         | 285/10830 [00:02<01:00, 173.64 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):   4%|▍         | 456/10830 [00:02<00:34, 299.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):   6%|β–‹         | 684/10830 [00:02<00:20, 506.62 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):   8%|β–Š         | 912/10830 [00:02<00:13, 736.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  11%|β–ˆ         | 1140/10830 [00:02<00:10, 947.17 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  13%|β–ˆβ–Ž        | 1368/10830 [00:02<00:08, 1094.03 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  15%|β–ˆβ–        | 1596/10830 [00:02<00:07, 1269.49 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  17%|β–ˆβ–‹        | 1824/10830 [00:02<00:06, 1437.65 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  19%|β–ˆβ–‰        | 2052/10830 [00:03<00:05, 1614.63 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  21%|β–ˆβ–ˆ        | 2280/10830 [00:03<00:05, 1635.72 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  23%|β–ˆβ–ˆβ–Ž       | 2508/10830 [00:03<00:04, 1732.21 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  25%|β–ˆβ–ˆβ–Œ       | 2736/10830 [00:03<00:04, 1721.60 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  27%|β–ˆβ–ˆβ–‹       | 2964/10830 [00:03<00:04, 1703.27 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  29%|β–ˆβ–ˆβ–‰       | 3192/10830 [00:03<00:04, 1798.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  32%|β–ˆβ–ˆβ–ˆβ–      | 3477/10830 [00:03<00:03, 1958.86 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  34%|β–ˆβ–ˆβ–ˆβ–      | 3705/10830 [00:03<00:03, 2037.08 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  36%|β–ˆβ–ˆβ–ˆβ–‹      | 3933/10830 [00:04<00:03, 2067.96 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  38%|β–ˆβ–ˆβ–ˆβ–Š      | 4161/10830 [00:04<00:03, 2091.19 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 4389/10830 [00:04<00:05, 1127.36 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 4670/10830 [00:04<00:04, 1385.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 4894/10830 [00:04<00:04, 1432.10 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192):  69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 7526/10830 [00:04<00:00, 6499.14 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:05<00:00, 1931.57 examples/s]

Add position_id column (Sample Packing) (num_proc=192):   0%|          | 0/10830 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=192):   1%|          | 57/10830 [00:02<06:33, 27.40 examples/s]
Add position_id column (Sample Packing) (num_proc=192):   2%|▏         | 228/10830 [00:02<01:18, 135.14 examples/s]
Add position_id column (Sample Packing) (num_proc=192):   4%|▍         | 456/10830 [00:02<00:33, 310.31 examples/s]
Add position_id column (Sample Packing) (num_proc=192):   8%|β–Š         | 912/10830 [00:02<00:14, 692.10 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  11%|β–ˆ         | 1140/10830 [00:02<00:11, 858.26 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  13%|β–ˆβ–Ž        | 1368/10830 [00:02<00:09, 1027.56 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  15%|β–ˆβ–        | 1596/10830 [00:02<00:07, 1182.55 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  17%|β–ˆβ–‹        | 1881/10830 [00:02<00:06, 1425.26 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  20%|β–ˆβ–ˆ        | 2166/10830 [00:03<00:05, 1604.97 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  22%|β–ˆβ–ˆβ–       | 2394/10830 [00:03<00:04, 1738.29 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  25%|β–ˆβ–ˆβ–       | 2679/10830 [00:03<00:04, 1951.23 examples/s]
Add position_id column (Sample Packing) (num_proc=192):  63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 6854/10830 [00:03<00:00, 11681.66 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:04<00:00, 2621.72 examples/s]

Saving the dataset (0/42 shards):   0%|          | 0/10830 [00:00<?, ? examples/s]
Saving the dataset (0/42 shards):   2%|▏         | 258/10830 [00:00<00:22, 464.02 examples/s]
Saving the dataset (1/42 shards):   2%|▏         | 258/10830 [00:00<00:22, 464.02 examples/s]
Saving the dataset (2/42 shards):   7%|β–‹         | 774/10830 [00:00<00:21, 464.02 examples/s]
Saving the dataset (3/42 shards):   7%|β–‹         | 774/10830 [00:00<00:21, 464.02 examples/s]
Saving the dataset (4/42 shards):  14%|β–ˆβ–        | 1548/10830 [00:00<00:20, 464.02 examples/s]
Saving the dataset (5/42 shards):  14%|β–ˆβ–        | 1548/10830 [00:00<00:20, 464.02 examples/s]
Saving the dataset (6/42 shards):  17%|β–ˆβ–‹        | 1806/10830 [00:00<00:19, 464.02 examples/s]
Saving the dataset (7/42 shards):  19%|β–ˆβ–‰        | 2064/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (8/42 shards):  21%|β–ˆβ–ˆβ–       | 2322/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (9/42 shards):  21%|β–ˆβ–ˆβ–       | 2322/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (10/42 shards):  26%|β–ˆβ–ˆβ–Œ       | 2838/10830 [00:00<00:17, 464.02 examples/s]
Saving the dataset (11/42 shards):  29%|β–ˆβ–ˆβ–Š       | 3096/10830 [00:00<00:16, 464.02 examples/s]
Saving the dataset (12/42 shards):  31%|β–ˆβ–ˆβ–ˆ       | 3354/10830 [00:00<00:16, 464.02 examples/s]
Saving the dataset (13/42 shards):  33%|β–ˆβ–ˆβ–ˆβ–Ž      | 3612/10830 [00:00<00:15, 464.02 examples/s]
Saving the dataset (14/42 shards):  33%|β–ˆβ–ˆβ–ˆβ–Ž      | 3612/10830 [00:00<00:15, 464.02 examples/s]
Saving the dataset (15/42 shards):  38%|β–ˆβ–ˆβ–ˆβ–Š      | 4128/10830 [00:00<00:14, 464.02 examples/s]
Saving the dataset (16/42 shards):  40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 4386/10830 [00:00<00:13, 464.02 examples/s]
Saving the dataset (17/42 shards):  40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 4386/10830 [00:00<00:13, 464.02 examples/s]
Saving the dataset (18/42 shards):  45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 4902/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (19/42 shards):  48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 5160/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (20/42 shards):  48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 5160/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (21/42 shards):  52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 5676/10830 [00:00<00:11, 464.02 examples/s]
Saving the dataset (22/42 shards):  52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 5676/10830 [00:00<00:11, 464.02 examples/s]
Saving the dataset (23/42 shards):  55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 5934/10830 [00:00<00:10, 464.02 examples/s]
Saving the dataset (24/42 shards):  57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 6192/10830 [00:00<00:09, 464.02 examples/s]
Saving the dataset (25/42 shards):  64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (26/42 shards):  64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (27/42 shards):  64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (28/42 shards):  67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 7224/10830 [00:00<00:07, 464.02 examples/s]
Saving the dataset (29/42 shards):  74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (30/42 shards):  74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (31/42 shards):  74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (32/42 shards):  79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 8514/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (33/42 shards):  81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 8772/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (34/42 shards):  81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 8772/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (35/42 shards):  83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 9030/10830 [00:00<00:03, 464.02 examples/s]
Saving the dataset (36/42 shards):  88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 9545/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (37/42 shards):  88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 9545/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (38/42 shards):  91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9802/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (39/42 shards):  95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 10316/10830 [00:00<00:01, 464.02 examples/s]
Saving the dataset (40/42 shards):  95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 10316/10830 [00:00<00:01, 464.02 examples/s]
Saving the dataset (41/42 shards):  98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 10573/10830 [00:00<00:00, 464.02 examples/s]
Saving the dataset (42/42 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:00<00:00, 464.02 examples/s]
Saving the dataset (42/42 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10830/10830 [00:00<00:00, 16314.56 examples/s]
[2026-03-16 19:08:54,045] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:213] total_num_tokens: 33_957_071
[2026-03-16 19:08:54,340] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:213] `total_supervised_tokens: 32_028_150`
[2026-03-16 19:08:55,893] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.7050187587738037
[2026-03-16 19:11:05,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2148, 2146, 2148, 2145, 2146, 2146, 2148, 2145]
[2026-03-16 19:11:06,172] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:213] data_loader_len: 268
[2026-03-16 19:11:06,189] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:213] sample_packing_eff_est across ranks: [0.9646614789962769, 0.9657852649688721, 0.9646614789962769, 0.9657852649688721, 0.9648860096931458, 0.9648860096931458, 0.9653354287147522, 0.9657852649688721]
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:213] sample_packing_eff_est: 0.97
[2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:213] total_num_steps: 536
[2026-03-16 19:11:06,192] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:213] Maximum number of steps set at 536
[2026-03-16 19:11:06,242] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:213] loading tokenizer... Qwen/Qwen3-8B
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|>
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None
[2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model
[2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing...
[2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}

Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]
Loading weights: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 399/399 [00:00<00:00, 9671.84it/s]
[2026-03-16 19:11:09,882] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:359] [PID:213] Converting modules to torch.bfloat16
[2026-03-16 19:11:09,885] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:213] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-03-16 19:11:11,696] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:213] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.Multiple deprecation warnings due to FSDP2 conversion:
sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-03-16 19:11:12,192] [INFO] [axolotl.train.save_initial_configs:417] [PID:213] Pre-saving tokenizer to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,283] [INFO] [axolotl.train.save_initial_configs:422] [PID:213] Pre-saving model config to ./outputs/qwen3-sft-stmt-tk/...
[2026-03-16 19:11:12,286] [INFO] [axolotl.train.execute_training:218] [PID:213] Starting trainer...
[2026-03-16 19:11:14,793] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.9547648429870605
[2026-03-16 19:11:14,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2103, 2104, 2104, 2104, 2103, 2104, 2106, 2104]
[2026-03-16 19:11:15,013] [INFO] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:34] [PID:213] Broadcasting full state dict to all ranks...
[2026-03-16 19:11:22,269] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:86] [PID:213] Time taken to load full state dict: 7.26 seconds
[2026-03-16 19:11:22,270] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.log_gpu_memory_usage:127] [PID:213] Memory usage after broadcasting full state dict 3.067GB (+3.067GB allocated, +3.178GB reserved)
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
wandb: Currently logged in as: suxiaole0223 (suxiaole) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run kje10pck
wandb: Tracking run with wandb version 0.25.1
wandb: Run data is saved locally in /workspace/axolotl-workspace/wandb/run-20260316_191122-kje10pck
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run qwen3-8b-tk-run1
wandb: ⭐️ View project at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk
wandb: πŸš€ View run at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk/runs/kje10pck
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-16 19:11:25,554] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:213] The Axolotl config has been saved to the WandB run under files.

  0%|          | 0/536 [00:00<?, ?it/s][2026-03-16 19:11:57,210] [WARNING] [py.warnings._showwarnmsg:110] [PID:213] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
  _warn_once(


  0%|          | 1/536 [00:40<6:03:21, 40.75s/it]
  0%|          | 2/536 [00:43<2:42:00, 18.20s/it]
  1%|          | 3/536 [00:45<1:37:15, 10.95s/it]
  1%|          | 4/536 [00:47<1:07:23,  7.60s/it]
  1%|          | 5/536 [00:50<50:28,  5.70s/it]  
                                               
{'loss': '0.8667', 'grad_norm': '2.609', 'learning_rate': '7.547e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6531', 'tokens/total': 655360, 'tokens/trainable': 611049, 'epoch': '0.01908'}

  1%|          | 5/536 [00:50<50:28,  5.70s/it]
  1%|          | 6/536 [00:52<40:15,  4.56s/it]
  1%|▏         | 7/536 [00:55<34:02,  3.86s/it]
  1%|▏         | 8/536 [00:57<30:00,  3.41s/it]
  2%|▏         | 9/536 [00:59<26:45,  3.05s/it]
  2%|▏         | 10/536 [01:02<24:45,  2.82s/it]
                                                
{'loss': '0.8307', 'grad_norm': '2.5', 'learning_rate': '1.698e-06', 'ppl': '2.295', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6647', 'tokens/total': 1310720, 'tokens/trainable': 1224548, 'epoch': '0.03817'}

  2%|▏         | 10/536 [01:02<24:45,  2.82s/it]
  2%|▏         | 11/536 [01:04<23:13,  2.65s/it]
  2%|▏         | 12/536 [01:06<22:04,  2.53s/it]
  2%|▏         | 13/536 [01:08<21:32,  2.47s/it]
  3%|β–Ž         | 14/536 [01:11<21:27,  2.47s/it]
  3%|β–Ž         | 15/536 [01:13<21:28,  2.47s/it]
                                                
{'loss': '0.8487', 'grad_norm': '2.453', 'learning_rate': '2.642e-06', 'ppl': '2.337', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6160', 'tokens/total': 1966080, 'tokens/trainable': 1834432, 'epoch': '0.05725'}

  3%|β–Ž         | 15/536 [01:13<21:28,  2.47s/it]
  3%|β–Ž         | 16/536 [01:16<21:18,  2.46s/it]
  3%|β–Ž         | 17/536 [01:18<20:51,  2.41s/it]
  3%|β–Ž         | 18/536 [01:20<20:44,  2.40s/it]
  4%|β–Ž         | 19/536 [01:23<21:59,  2.55s/it]
  4%|β–Ž         | 20/536 [01:26<21:40,  2.52s/it]
                                                
{'loss': '0.7713', 'grad_norm': '1.898', 'learning_rate': '3.585e-06', 'ppl': '2.163', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6256', 'tokens/total': 2621440, 'tokens/trainable': 2448388, 'epoch': '0.07634'}

  4%|β–Ž         | 20/536 [01:26<21:40,  2.52s/it]
  4%|▍         | 21/536 [01:28<21:23,  2.49s/it]
  4%|▍         | 22/536 [01:31<20:49,  2.43s/it]
  4%|▍         | 23/536 [01:33<20:37,  2.41s/it]
  4%|▍         | 24/536 [01:35<20:37,  2.42s/it]
  5%|▍         | 25/536 [01:38<20:01,  2.35s/it]
                                                
{'loss': '0.7452', 'grad_norm': '1.273', 'learning_rate': '4.528e-06', 'ppl': '2.107', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6954', 'tokens/total': 3276800, 'tokens/trainable': 3060985, 'epoch': '0.09542'}

  5%|▍         | 25/536 [01:38<20:01,  2.35s/it]
  5%|▍         | 26/536 [01:40<19:41,  2.32s/it]
  5%|β–Œ         | 27/536 [01:42<19:26,  2.29s/it][2026-03-16 19:13:17,483] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-27


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.48s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.48s/it]

  5%|β–Œ         | 28/536 [03:16<4:11:47, 29.74s/it]
  5%|β–Œ         | 29/536 [03:18<3:01:39, 21.50s/it]
  6%|β–Œ         | 30/536 [03:20<2:12:38, 15.73s/it]
                                                  
{'loss': '0.718', 'grad_norm': '0.7695', 'learning_rate': '5.472e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6706', 'tokens/total': 3932160, 'tokens/trainable': 3670695, 'epoch': '0.1145'}

  6%|β–Œ         | 30/536 [03:20<2:12:38, 15.73s/it]
  6%|β–Œ         | 31/536 [03:23<1:38:27, 11.70s/it]
  6%|β–Œ         | 32/536 [03:25<1:14:38,  8.89s/it]
  6%|β–Œ         | 33/536 [03:27<57:48,  6.90s/it]  
  6%|β–‹         | 34/536 [03:29<46:07,  5.51s/it]
  7%|β–‹         | 35/536 [03:32<37:56,  4.54s/it]
                                                
{'loss': '0.6699', 'grad_norm': '0.6406', 'learning_rate': '6.415e-06', 'ppl': '1.954', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6770', 'tokens/total': 4587520, 'tokens/trainable': 4284736, 'epoch': '0.1336'}

  7%|β–‹         | 35/536 [03:32<37:56,  4.54s/it]
  7%|β–‹         | 36/536 [03:34<32:19,  3.88s/it]
  7%|β–‹         | 37/536 [03:37<28:45,  3.46s/it]
  7%|β–‹         | 38/536 [03:39<26:05,  3.14s/it]
  7%|β–‹         | 39/536 [03:41<24:10,  2.92s/it]
  7%|β–‹         | 40/536 [03:44<22:31,  2.72s/it]
                                                
{'loss': '0.6393', 'grad_norm': '0.418', 'learning_rate': '7.358e-06', 'ppl': '1.895', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6668', 'tokens/total': 5242880, 'tokens/trainable': 4896504, 'epoch': '0.1527'}

  7%|β–‹         | 40/536 [03:44<22:31,  2.72s/it]
  8%|β–Š         | 41/536 [03:46<21:24,  2.59s/it]
  8%|β–Š         | 42/536 [03:48<20:36,  2.50s/it]
  8%|β–Š         | 43/536 [03:51<20:06,  2.45s/it]
  8%|β–Š         | 44/536 [03:53<19:38,  2.39s/it]
  8%|β–Š         | 45/536 [03:55<19:17,  2.36s/it]
                                                
{'loss': '0.5953', 'grad_norm': '0.3594', 'learning_rate': '8.302e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 5898240, 'tokens/trainable': 5505933, 'epoch': '0.1718'}

  8%|β–Š         | 45/536 [03:55<19:17,  2.36s/it]
  9%|β–Š         | 46/536 [03:57<19:17,  2.36s/it]
  9%|β–‰         | 47/536 [04:00<19:01,  2.33s/it]
  9%|β–‰         | 48/536 [04:02<19:02,  2.34s/it]
  9%|β–‰         | 49/536 [04:04<19:02,  2.35s/it]
  9%|β–‰         | 50/536 [04:07<18:55,  2.34s/it]
                                                
{'loss': '0.5779', 'grad_norm': '0.332', 'learning_rate': '9.245e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6574', 'tokens/total': 6553600, 'tokens/trainable': 6116643, 'epoch': '0.1908'}

  9%|β–‰         | 50/536 [04:07<18:55,  2.34s/it]
 10%|β–‰         | 51/536 [04:09<18:46,  2.32s/it]
 10%|β–‰         | 52/536 [04:11<18:33,  2.30s/it]
 10%|β–‰         | 53/536 [04:14<18:19,  2.28s/it]
 10%|β–ˆ         | 54/536 [04:16<18:23,  2.29s/it][2026-03-16 19:15:50,860] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-54


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.65s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.65s/it]

 10%|β–ˆ         | 55/536 [05:48<3:55:25, 29.37s/it]
                                                  
{'loss': '0.5579', 'grad_norm': '0.2793', 'learning_rate': '1e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4847', 'tokens/total': 7208960, 'tokens/trainable': 6728061, 'epoch': '0.2099'}

 10%|β–ˆ         | 55/536 [05:48<3:55:25, 29.37s/it]
 10%|β–ˆ         | 56/536 [05:51<2:50:56, 21.37s/it]
 11%|β–ˆ         | 57/536 [05:54<2:05:27, 15.72s/it]
 11%|β–ˆ         | 58/536 [05:56<1:33:09, 11.69s/it]
 11%|β–ˆ         | 59/536 [05:58<1:11:13,  8.96s/it]
 11%|β–ˆ         | 60/536 [06:01<55:01,  6.94s/it]  
                                                
{'loss': '0.5485', 'grad_norm': '0.2773', 'learning_rate': '9.996e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 7864320, 'tokens/trainable': 7336524, 'epoch': '0.229'}

 11%|β–ˆ         | 60/536 [06:01<55:01,  6.94s/it]
 11%|β–ˆβ–        | 61/536 [06:03<43:45,  5.53s/it]
 12%|β–ˆβ–        | 62/536 [06:05<36:05,  4.57s/it]
 12%|β–ˆβ–        | 63/536 [06:08<30:31,  3.87s/it]
 12%|β–ˆβ–        | 64/536 [06:10<26:38,  3.39s/it]
 12%|β–ˆβ–        | 65/536 [06:12<24:01,  3.06s/it]
                                                
{'loss': '0.5385', 'grad_norm': '0.2734', 'learning_rate': '9.987e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6565', 'tokens/total': 8519680, 'tokens/trainable': 7944984, 'epoch': '0.2481'}

 12%|β–ˆβ–        | 65/536 [06:12<24:01,  3.06s/it]
 12%|β–ˆβ–        | 66/536 [06:14<22:09,  2.83s/it]
 12%|β–ˆβ–Ž        | 67/536 [06:17<21:06,  2.70s/it]
 13%|β–ˆβ–Ž        | 68/536 [06:19<20:07,  2.58s/it]
 13%|β–ˆβ–Ž        | 69/536 [06:21<19:21,  2.49s/it]
 13%|β–ˆβ–Ž        | 70/536 [06:24<19:00,  2.45s/it]
                                                
{'loss': '0.5197', 'grad_norm': '0.2578', 'learning_rate': '9.973e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 9175040, 'tokens/trainable': 8556200, 'epoch': '0.2672'}

 13%|β–ˆβ–Ž        | 70/536 [06:24<19:00,  2.45s/it]
 13%|β–ˆβ–Ž        | 71/536 [06:26<18:33,  2.39s/it]
 13%|β–ˆβ–Ž        | 72/536 [06:28<18:14,  2.36s/it]
 14%|β–ˆβ–Ž        | 73/536 [06:31<18:01,  2.34s/it]
 14%|β–ˆβ–        | 74/536 [06:33<18:02,  2.34s/it]
 14%|β–ˆβ–        | 75/536 [06:35<17:54,  2.33s/it]
                                                
{'loss': '0.5316', 'grad_norm': '0.3008', 'learning_rate': '9.953e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6728', 'tokens/total': 9830400, 'tokens/trainable': 9167282, 'epoch': '0.2863'}

 14%|β–ˆβ–        | 75/536 [06:35<17:54,  2.33s/it]
 14%|β–ˆβ–        | 76/536 [06:38<17:54,  2.34s/it]
 14%|β–ˆβ–        | 77/536 [06:40<18:05,  2.37s/it]
 15%|β–ˆβ–        | 78/536 [06:43<18:28,  2.42s/it]
 15%|β–ˆβ–        | 79/536 [06:45<18:05,  2.37s/it]
 15%|β–ˆβ–        | 80/536 [06:47<17:47,  2.34s/it]
                                                
{'loss': '0.5154', 'grad_norm': '0.3164', 'learning_rate': '9.929e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6730', 'tokens/total': 10485760, 'tokens/trainable': 9774908, 'epoch': '0.3053'}

 15%|β–ˆβ–        | 80/536 [06:47<17:47,  2.34s/it]
 15%|β–ˆβ–Œ        | 81/536 [06:49<17:39,  2.33s/it][2026-03-16 19:18:24,375] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-81


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.37s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.37s/it]

 15%|β–ˆβ–Œ        | 82/536 [08:22<3:43:29, 29.54s/it]
 15%|β–ˆβ–Œ        | 83/536 [08:25<2:41:12, 21.35s/it]
 16%|β–ˆβ–Œ        | 84/536 [08:27<1:57:43, 15.63s/it]
 16%|β–ˆβ–Œ        | 85/536 [08:29<1:27:29, 11.64s/it]
                                                  
{'loss': '0.5143', 'grad_norm': '0.2363', 'learning_rate': '9.899e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6604', 'tokens/total': 11141120, 'tokens/trainable': 10388109, 'epoch': '0.3244'}

 16%|β–ˆβ–Œ        | 85/536 [08:29<1:27:29, 11.64s/it]
 16%|β–ˆβ–Œ        | 86/536 [08:32<1:06:16,  8.84s/it]
 16%|β–ˆβ–Œ        | 87/536 [08:34<51:19,  6.86s/it]  
 16%|β–ˆβ–‹        | 88/536 [08:36<40:55,  5.48s/it]
 17%|β–ˆβ–‹        | 89/536 [08:38<33:37,  4.51s/it]
 17%|β–ˆβ–‹        | 90/536 [08:41<28:43,  3.86s/it]
                                                
{'loss': '0.4957', 'grad_norm': '0.2412', 'learning_rate': '9.864e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 11796480, 'tokens/trainable': 10999678, 'epoch': '0.3435'}

 17%|β–ˆβ–‹        | 90/536 [08:41<28:43,  3.86s/it]
 17%|β–ˆβ–‹        | 91/536 [08:43<25:17,  3.41s/it]
 17%|β–ˆβ–‹        | 92/536 [08:45<22:49,  3.08s/it]
 17%|β–ˆβ–‹        | 93/536 [08:48<21:12,  2.87s/it]
 18%|β–ˆβ–Š        | 94/536 [08:50<19:44,  2.68s/it]
 18%|β–ˆβ–Š        | 95/536 [08:52<19:22,  2.64s/it]
                                                
{'loss': '0.509', 'grad_norm': '0.2236', 'learning_rate': '9.823e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5974', 'tokens/total': 12451840, 'tokens/trainable': 11609345, 'epoch': '0.3626'}

 18%|β–ˆβ–Š        | 95/536 [08:52<19:22,  2.64s/it]
 18%|β–ˆβ–Š        | 96/536 [08:55<18:35,  2.54s/it]
 18%|β–ˆβ–Š        | 97/536 [08:57<18:01,  2.46s/it]
 18%|β–ˆβ–Š        | 98/536 [09:00<19:09,  2.62s/it]
 18%|β–ˆβ–Š        | 99/536 [09:03<19:00,  2.61s/it]
 19%|β–ˆβ–Š        | 100/536 [09:05<18:16,  2.51s/it]
                                                 
{'loss': '0.4925', 'grad_norm': '0.2451', 'learning_rate': '9.778e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6696', 'tokens/total': 13107200, 'tokens/trainable': 12218448, 'epoch': '0.3817'}

 19%|β–ˆβ–Š        | 100/536 [09:05<18:16,  2.51s/it]
 19%|β–ˆβ–‰        | 101/536 [09:07<17:50,  2.46s/it]
 19%|β–ˆβ–‰        | 102/536 [09:09<17:19,  2.40s/it]
 19%|β–ˆβ–‰        | 103/536 [09:12<16:59,  2.35s/it]
 19%|β–ˆβ–‰        | 104/536 [09:14<16:47,  2.33s/it]
 20%|β–ˆβ–‰        | 105/536 [09:16<16:34,  2.31s/it]
                                                 
{'loss': '0.5051', 'grad_norm': '0.25', 'learning_rate': '9.727e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6724', 'tokens/total': 13762560, 'tokens/trainable': 12826468, 'epoch': '0.4008'}

 20%|β–ˆβ–‰        | 105/536 [09:16<16:34,  2.31s/it]
 20%|β–ˆβ–‰        | 106/536 [09:19<16:28,  2.30s/it]
 20%|β–ˆβ–‰        | 107/536 [09:21<16:26,  2.30s/it]
 20%|β–ˆβ–ˆ        | 108/536 [09:23<16:27,  2.31s/it][2026-03-16 19:20:58,221] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-108


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.13s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.13s/it]

 20%|β–ˆβ–ˆ        | 109/536 [11:03<3:44:22, 31.53s/it]
 21%|β–ˆβ–ˆ        | 110/536 [11:05<2:41:43, 22.78s/it]
                                                   
{'loss': '0.4725', 'grad_norm': '0.2266', 'learning_rate': '9.672e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6506', 'tokens/total': 14417920, 'tokens/trainable': 13440042, 'epoch': '0.4198'}

 21%|β–ˆβ–ˆ        | 110/536 [11:05<2:41:43, 22.78s/it]
 21%|β–ˆβ–ˆ        | 111/536 [11:07<1:57:41, 16.61s/it]
 21%|β–ˆβ–ˆ        | 112/536 [11:10<1:26:54, 12.30s/it]
 21%|β–ˆβ–ˆ        | 113/536 [11:12<1:05:42,  9.32s/it]
 21%|β–ˆβ–ˆβ–       | 114/536 [11:14<50:52,  7.23s/it]  
 21%|β–ˆβ–ˆβ–       | 115/536 [11:17<40:26,  5.76s/it]
                                                 
{'loss': '0.5004', 'grad_norm': '0.2256', 'learning_rate': '9.612e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6525', 'tokens/total': 15073280, 'tokens/trainable': 14049913, 'epoch': '0.4389'}

 21%|β–ˆβ–ˆβ–       | 115/536 [11:17<40:26,  5.76s/it]
 22%|β–ˆβ–ˆβ–       | 116/536 [11:19<33:04,  4.72s/it]
 22%|β–ˆβ–ˆβ–       | 117/536 [11:22<28:43,  4.11s/it]
 22%|β–ˆβ–ˆβ–       | 118/536 [11:24<24:56,  3.58s/it]
 22%|β–ˆβ–ˆβ–       | 119/536 [11:26<22:09,  3.19s/it]
 22%|β–ˆβ–ˆβ–       | 120/536 [11:29<20:22,  2.94s/it]
                                                 
{'loss': '0.4727', 'grad_norm': '0.248', 'learning_rate': '9.546e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6422', 'tokens/total': 15728640, 'tokens/trainable': 14657396, 'epoch': '0.458'}

 22%|β–ˆβ–ˆβ–       | 120/536 [11:29<20:22,  2.94s/it]
 23%|β–ˆβ–ˆβ–Ž       | 121/536 [11:31<19:02,  2.75s/it]
 23%|β–ˆβ–ˆβ–Ž       | 122/536 [11:33<18:01,  2.61s/it]
 23%|β–ˆβ–ˆβ–Ž       | 123/536 [11:36<17:25,  2.53s/it]
 23%|β–ˆβ–ˆβ–Ž       | 124/536 [11:38<17:02,  2.48s/it]
 23%|β–ˆβ–ˆβ–Ž       | 125/536 [11:40<16:31,  2.41s/it]
                                                 
{'loss': '0.4808', 'grad_norm': '0.2344', 'learning_rate': '9.476e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6830', 'tokens/total': 16384000, 'tokens/trainable': 15266794, 'epoch': '0.4771'}

 23%|β–ˆβ–ˆβ–Ž       | 125/536 [11:40<16:31,  2.41s/it]
 24%|β–ˆβ–ˆβ–Ž       | 126/536 [11:43<16:21,  2.39s/it]
 24%|β–ˆβ–ˆβ–Ž       | 127/536 [11:45<16:21,  2.40s/it]
 24%|β–ˆβ–ˆβ–       | 128/536 [11:47<16:06,  2.37s/it]
 24%|β–ˆβ–ˆβ–       | 129/536 [11:50<15:59,  2.36s/it]
 24%|β–ˆβ–ˆβ–       | 130/536 [11:52<15:57,  2.36s/it]
                                                 
{'loss': '0.4726', 'grad_norm': '0.2451', 'learning_rate': '9.401e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 17039360, 'tokens/trainable': 15876387, 'epoch': '0.4962'}

 24%|β–ˆβ–ˆβ–       | 130/536 [11:52<15:57,  2.36s/it]
 24%|β–ˆβ–ˆβ–       | 131/536 [11:54<15:48,  2.34s/it]
 25%|β–ˆβ–ˆβ–       | 132/536 [11:57<15:37,  2.32s/it]
 25%|β–ˆβ–ˆβ–       | 133/536 [11:59<15:31,  2.31s/it]
 25%|β–ˆβ–ˆβ–Œ       | 134/536 [12:01<15:54,  2.37s/it]
 25%|β–ˆβ–ˆβ–Œ       | 135/536 [12:04<16:09,  2.42s/it]
                                                 
{'loss': '0.4864', 'grad_norm': '0.2344', 'learning_rate': '9.322e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6056', 'tokens/total': 17694720, 'tokens/trainable': 16486440, 'epoch': '0.5153'}

 25%|β–ˆβ–ˆβ–Œ       | 135/536 [12:04<16:09,  2.42s/it][2026-03-16 19:23:38,988] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-135


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.41s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.41s/it]

 25%|β–ˆβ–ˆβ–Œ       | 136/536 [13:41<3:26:11, 30.93s/it]
 26%|β–ˆβ–ˆβ–Œ       | 137/536 [13:45<2:30:52, 22.69s/it]
 26%|β–ˆβ–ˆβ–Œ       | 138/536 [13:47<1:49:51, 16.56s/it]
 26%|β–ˆβ–ˆβ–Œ       | 139/536 [13:49<1:21:09, 12.27s/it]
 26%|β–ˆβ–ˆβ–Œ       | 140/536 [13:52<1:01:08,  9.26s/it]
                                                   
{'loss': '0.4817', 'grad_norm': '0.2275', 'learning_rate': '9.238e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6712', 'tokens/total': 18350080, 'tokens/trainable': 17095060, 'epoch': '0.5344'}

 26%|β–ˆβ–ˆβ–Œ       | 140/536 [13:52<1:01:08,  9.26s/it]
 26%|β–ˆβ–ˆβ–‹       | 141/536 [13:54<47:09,  7.16s/it]  
 26%|β–ˆβ–ˆβ–‹       | 142/536 [13:56<37:27,  5.70s/it]
 27%|β–ˆβ–ˆβ–‹       | 143/536 [13:58<30:36,  4.67s/it]
 27%|β–ˆβ–ˆβ–‹       | 144/536 [14:01<25:57,  3.97s/it]
 27%|β–ˆβ–ˆβ–‹       | 145/536 [14:03<22:36,  3.47s/it]
                                                 
{'loss': '0.4827', 'grad_norm': '0.249', 'learning_rate': '9.149e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6633', 'tokens/total': 19005440, 'tokens/trainable': 17703368, 'epoch': '0.5534'}

 27%|β–ˆβ–ˆβ–‹       | 145/536 [14:03<22:36,  3.47s/it]
 27%|β–ˆβ–ˆβ–‹       | 146/536 [14:05<20:19,  3.13s/it]
 27%|β–ˆβ–ˆβ–‹       | 147/536 [14:08<18:42,  2.89s/it]
 28%|β–ˆβ–ˆβ–Š       | 148/536 [14:10<17:44,  2.74s/it]
 28%|β–ˆβ–ˆβ–Š       | 149/536 [14:12<16:47,  2.60s/it]
 28%|β–ˆβ–ˆβ–Š       | 150/536 [14:15<16:15,  2.53s/it]
                                                 
{'loss': '0.4892', 'grad_norm': '0.2217', 'learning_rate': '9.057e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6466', 'tokens/total': 19660800, 'tokens/trainable': 18311084, 'epoch': '0.5725'}

 28%|β–ˆβ–ˆβ–Š       | 150/536 [14:15<16:15,  2.53s/it]
 28%|β–ˆβ–ˆβ–Š       | 151/536 [14:17<15:49,  2.47s/it]
 28%|β–ˆβ–ˆβ–Š       | 152/536 [14:20<15:51,  2.48s/it]
 29%|β–ˆβ–ˆβ–Š       | 153/536 [14:22<16:18,  2.55s/it]
 29%|β–ˆβ–ˆβ–Š       | 154/536 [14:25<16:18,  2.56s/it]
 29%|β–ˆβ–ˆβ–‰       | 155/536 [14:27<16:04,  2.53s/it]
                                                 
{'loss': '0.4618', 'grad_norm': '0.2236', 'learning_rate': '8.959e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6104', 'tokens/total': 20316160, 'tokens/trainable': 18920000, 'epoch': '0.5916'}

 29%|β–ˆβ–ˆβ–‰       | 155/536 [14:27<16:04,  2.53s/it]
 29%|β–ˆβ–ˆβ–‰       | 156/536 [14:30<15:36,  2.47s/it]
 29%|β–ˆβ–ˆβ–‰       | 157/536 [14:32<15:16,  2.42s/it]
 29%|β–ˆβ–ˆβ–‰       | 158/536 [14:35<15:24,  2.45s/it]
 30%|β–ˆβ–ˆβ–‰       | 159/536 [14:37<15:05,  2.40s/it]
 30%|β–ˆβ–ˆβ–‰       | 160/536 [14:39<14:54,  2.38s/it]
                                                 
{'loss': '0.471', 'grad_norm': '0.2793', 'learning_rate': '8.858e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6516', 'tokens/total': 20971520, 'tokens/trainable': 19529720, 'epoch': '0.6107'}

 30%|β–ˆβ–ˆβ–‰       | 160/536 [14:39<14:54,  2.38s/it]
 30%|β–ˆβ–ˆβ–ˆ       | 161/536 [14:41<14:48,  2.37s/it]
 30%|β–ˆβ–ˆβ–ˆ       | 162/536 [14:44<14:28,  2.32s/it][2026-03-16 19:26:18,649] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-162


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]

 30%|β–ˆβ–ˆβ–ˆ       | 163/536 [16:21<3:11:06, 30.74s/it]
 31%|β–ˆβ–ˆβ–ˆ       | 164/536 [16:23<2:17:38, 22.20s/it]
 31%|β–ˆβ–ˆβ–ˆ       | 165/536 [16:25<1:40:18, 16.22s/it]
                                                   
{'loss': '0.4703', 'grad_norm': '0.2383', 'learning_rate': '8.752e-06', 'ppl': '1.6', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6785', 'tokens/total': 21626880, 'tokens/trainable': 20137712, 'epoch': '0.6298'}

 31%|β–ˆβ–ˆβ–ˆ       | 165/536 [16:25<1:40:18, 16.22s/it]
 31%|β–ˆβ–ˆβ–ˆ       | 166/536 [16:28<1:14:13, 12.04s/it]
 31%|β–ˆβ–ˆβ–ˆ       | 167/536 [16:30<56:04,  9.12s/it]  
 31%|β–ˆβ–ˆβ–ˆβ–      | 168/536 [16:32<43:22,  7.07s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 169/536 [16:34<34:28,  5.64s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 170/536 [16:37<28:13,  4.63s/it]
                                                 
{'loss': '0.4727', 'grad_norm': '0.2139', 'learning_rate': '8.643e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6694', 'tokens/total': 22282240, 'tokens/trainable': 20749040, 'epoch': '0.6489'}

 32%|β–ˆβ–ˆβ–ˆβ–      | 170/536 [16:37<28:13,  4.63s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 171/536 [16:39<24:07,  3.96s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 172/536 [16:42<21:19,  3.51s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 173/536 [16:44<19:34,  3.24s/it]
 32%|β–ˆβ–ˆβ–ˆβ–      | 174/536 [16:46<17:46,  2.95s/it]
 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 175/536 [16:49<16:24,  2.73s/it]
                                                 
{'loss': '0.4856', 'grad_norm': '0.2119', 'learning_rate': '8.53e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6812', 'tokens/total': 22937600, 'tokens/trainable': 21358216, 'epoch': '0.6679'}

 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 175/536 [16:49<16:24,  2.73s/it]
 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 176/536 [16:51<15:32,  2.59s/it]
 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 177/536 [16:54<16:38,  2.78s/it]
 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 178/536 [16:57<15:52,  2.66s/it]
 33%|β–ˆβ–ˆβ–ˆβ–Ž      | 179/536 [16:59<15:07,  2.54s/it]
 34%|β–ˆβ–ˆβ–ˆβ–Ž      | 180/536 [17:01<14:47,  2.49s/it]
                                                 
{'loss': '0.4551', 'grad_norm': '0.2266', 'learning_rate': '8.413e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6375', 'tokens/total': 23592960, 'tokens/trainable': 21963408, 'epoch': '0.687'}

 34%|β–ˆβ–ˆβ–ˆβ–Ž      | 180/536 [17:01<14:47,  2.49s/it]
 34%|β–ˆβ–ˆβ–ˆβ–      | 181/536 [17:04<14:25,  2.44s/it]
 34%|β–ˆβ–ˆβ–ˆβ–      | 182/536 [17:06<14:08,  2.40s/it]
 34%|β–ˆβ–ˆβ–ˆβ–      | 183/536 [17:08<13:47,  2.34s/it]
 34%|β–ˆβ–ˆβ–ˆβ–      | 184/536 [17:10<13:42,  2.34s/it]
 35%|β–ˆβ–ˆβ–ˆβ–      | 185/536 [17:13<13:34,  2.32s/it]
                                                 
{'loss': '0.4654', 'grad_norm': '0.2695', 'learning_rate': '8.292e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6688', 'tokens/total': 24248320, 'tokens/trainable': 22570984, 'epoch': '0.7061'}

 35%|β–ˆβ–ˆβ–ˆβ–      | 185/536 [17:13<13:34,  2.32s/it]
 35%|β–ˆβ–ˆβ–ˆβ–      | 186/536 [17:15<13:21,  2.29s/it]
 35%|β–ˆβ–ˆβ–ˆβ–      | 187/536 [17:17<13:19,  2.29s/it]
 35%|β–ˆβ–ˆβ–ˆβ–Œ      | 188/536 [17:19<13:11,  2.28s/it]
 35%|β–ˆβ–ˆβ–ˆβ–Œ      | 189/536 [17:22<13:08,  2.27s/it][2026-03-16 19:28:56,617] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-189


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.04s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.04s/it]

 35%|β–ˆβ–ˆβ–ˆβ–Œ      | 190/536 [19:01<3:00:49, 31.36s/it]
                                                   
{'loss': '0.4727', 'grad_norm': '0.2285', 'learning_rate': '8.168e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4795', 'tokens/total': 24903680, 'tokens/trainable': 23180680, 'epoch': '0.7252'}

 35%|β–ˆβ–ˆβ–ˆβ–Œ      | 190/536 [19:01<3:00:49, 31.36s/it]
 36%|β–ˆβ–ˆβ–ˆβ–Œ      | 191/536 [19:03<2:10:15, 22.65s/it]
 36%|β–ˆβ–ˆβ–ˆβ–Œ      | 192/536 [19:06<1:35:23, 16.64s/it]
 36%|β–ˆβ–ˆβ–ˆβ–Œ      | 193/536 [19:08<1:11:08, 12.44s/it]
 36%|β–ˆβ–ˆβ–ˆβ–Œ      | 194/536 [19:11<53:42,  9.42s/it]  
 36%|β–ˆβ–ˆβ–ˆβ–‹      | 195/536 [19:13<41:27,  7.30s/it]
                                                 
{'loss': '0.462', 'grad_norm': '0.2158', 'learning_rate': '8.041e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6567', 'tokens/total': 25559040, 'tokens/trainable': 23790730, 'epoch': '0.7443'}

 36%|β–ˆβ–ˆβ–ˆβ–‹      | 195/536 [19:13<41:27,  7.30s/it]
 37%|β–ˆβ–ˆβ–ˆβ–‹      | 196/536 [19:16<32:54,  5.81s/it]
 37%|β–ˆβ–ˆβ–ˆβ–‹      | 197/536 [19:18<27:50,  4.93s/it]
 37%|β–ˆβ–ˆβ–ˆβ–‹      | 198/536 [19:21<23:17,  4.13s/it]
 37%|β–ˆβ–ˆβ–ˆβ–‹      | 199/536 [19:23<20:04,  3.57s/it]
 37%|β–ˆβ–ˆβ–ˆβ–‹      | 200/536 [19:25<17:43,  3.17s/it]
                                                 
{'loss': '0.4676', 'grad_norm': '0.2188', 'learning_rate': '7.91e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6880', 'tokens/total': 26214400, 'tokens/trainable': 24401252, 'epoch': '0.7634'}

 37%|β–ˆβ–ˆβ–ˆβ–‹      | 200/536 [19:25<17:43,  3.17s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 201/536 [19:27<16:11,  2.90s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 202/536 [19:30<15:05,  2.71s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 203/536 [19:32<14:15,  2.57s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 204/536 [19:34<13:41,  2.47s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 205/536 [19:37<13:22,  2.43s/it]
                                                 
{'loss': '0.4504', 'grad_norm': '0.2158', 'learning_rate': '7.776e-06', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6617', 'tokens/total': 26869760, 'tokens/trainable': 25010696, 'epoch': '0.7824'}

 38%|β–ˆβ–ˆβ–ˆβ–Š      | 205/536 [19:37<13:22,  2.43s/it]
 38%|β–ˆβ–ˆβ–ˆβ–Š      | 206/536 [19:39<13:08,  2.39s/it]
 39%|β–ˆβ–ˆβ–ˆβ–Š      | 207/536 [19:41<12:51,  2.34s/it]
 39%|β–ˆβ–ˆβ–ˆβ–‰      | 208/536 [19:43<12:41,  2.32s/it]
 39%|β–ˆβ–ˆβ–ˆβ–‰      | 209/536 [19:46<12:40,  2.32s/it]
 39%|β–ˆβ–ˆβ–ˆβ–‰      | 210/536 [19:48<13:00,  2.39s/it]
                                                 
{'loss': '0.4614', 'grad_norm': '0.2295', 'learning_rate': '7.639e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5994', 'tokens/total': 27525120, 'tokens/trainable': 25617872, 'epoch': '0.8015'}

 39%|β–ˆβ–ˆβ–ˆβ–‰      | 210/536 [19:48<13:00,  2.39s/it]
 39%|β–ˆβ–ˆβ–ˆβ–‰      | 211/536 [19:51<13:14,  2.44s/it]
 40%|β–ˆβ–ˆβ–ˆβ–‰      | 212/536 [19:53<12:58,  2.40s/it]
 40%|β–ˆβ–ˆβ–ˆβ–‰      | 213/536 [19:55<12:42,  2.36s/it]
 40%|β–ˆβ–ˆβ–ˆβ–‰      | 214/536 [19:58<12:29,  2.33s/it]
 40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 215/536 [20:00<12:22,  2.31s/it]
                                                 
{'loss': '0.477', 'grad_norm': '0.2412', 'learning_rate': '7.5e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6697', 'tokens/total': 28180480, 'tokens/trainable': 26227438, 'epoch': '0.8206'}

 40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 215/536 [20:00<12:22,  2.31s/it]
 40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 216/536 [20:02<12:22,  2.32s/it][2026-03-16 19:31:37,309] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-216


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.85s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.85s/it]

 40%|β–ˆβ–ˆβ–ˆβ–ˆ      | 217/536 [21:40<2:45:11, 31.07s/it]
 41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 218/536 [21:43<1:59:15, 22.50s/it]
 41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 219/536 [21:45<1:26:49, 16.43s/it]
 41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 220/536 [21:47<1:04:08, 12.18s/it]
                                                   
{'loss': '0.4535', 'grad_norm': '0.2148', 'learning_rate': '7.358e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 28835840, 'tokens/trainable': 26833456, 'epoch': '0.8397'}

 41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 220/536 [21:47<1:04:08, 12.18s/it]
 41%|β–ˆβ–ˆβ–ˆβ–ˆ      | 221/536 [21:50<48:25,  9.22s/it]  
 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 222/536 [21:52<37:25,  7.15s/it]
 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 223/536 [21:54<29:44,  5.70s/it]
 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 224/536 [21:57<24:17,  4.67s/it]
 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 225/536 [21:59<20:30,  3.96s/it]
                                                 
{'loss': '0.4639', 'grad_norm': '0.2197', 'learning_rate': '7.213e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 29491200, 'tokens/trainable': 27444416, 'epoch': '0.8588'}

 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 225/536 [21:59<20:30,  3.96s/it]
 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 226/536 [22:01<17:50,  3.45s/it]
 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 227/536 [22:03<15:56,  3.10s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 228/536 [22:06<14:40,  2.86s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 229/536 [22:08<14:24,  2.82s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 230/536 [22:11<14:01,  2.75s/it]
                                                 
{'loss': '0.4578', 'grad_norm': '0.2217', 'learning_rate': '7.066e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5816', 'tokens/total': 30146560, 'tokens/trainable': 28048432, 'epoch': '0.8779'}

 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 230/536 [22:11<14:01,  2.75s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 231/536 [22:13<13:21,  2.63s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 232/536 [22:16<12:45,  2.52s/it]
 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 233/536 [22:18<12:29,  2.47s/it]
 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž     | 234/536 [22:20<12:15,  2.44s/it]
 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 235/536 [22:23<11:59,  2.39s/it]
                                                 
{'loss': '0.4497', 'grad_norm': '0.2354', 'learning_rate': '6.917e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6580', 'tokens/total': 30801920, 'tokens/trainable': 28655952, 'epoch': '0.8969'}

 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 235/536 [22:23<11:59,  2.39s/it]
 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 236/536 [22:25<11:55,  2.39s/it]
 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 237/536 [22:27<11:47,  2.37s/it]
 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 238/536 [22:30<11:38,  2.34s/it]
 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 239/536 [22:32<11:38,  2.35s/it]
 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 240/536 [22:34<11:23,  2.31s/it]
                                                 
{'loss': '0.4693', 'grad_norm': '0.2275', 'learning_rate': '6.766e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 31457280, 'tokens/trainable': 29262050, 'epoch': '0.916'}

 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 240/536 [22:34<11:23,  2.31s/it]
 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–     | 241/536 [22:37<11:19,  2.30s/it]
 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 242/536 [22:39<11:12,  2.29s/it]
 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 243/536 [22:41<11:10,  2.29s/it][2026-03-16 19:34:16,197] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-243


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.81s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]

 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 244/536 [24:21<2:33:31, 31.55s/it]
 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 245/536 [24:23<1:50:23, 22.76s/it]
                                                   
{'loss': '0.4629', 'grad_norm': '0.2178', 'learning_rate': '6.613e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 32112640, 'tokens/trainable': 29868356, 'epoch': '0.9351'}

 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 245/536 [24:23<1:50:23, 22.76s/it]
 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 246/536 [24:25<1:20:19, 16.62s/it]
 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ     | 247/536 [24:28<59:24, 12.33s/it]  
 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 248/536 [24:30<45:07,  9.40s/it]
 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 249/536 [24:33<34:46,  7.27s/it]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 250/536 [24:35<27:30,  5.77s/it]
                                                 
{'loss': '0.474', 'grad_norm': '0.2539', 'learning_rate': '6.458e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6714', 'tokens/total': 32768000, 'tokens/trainable': 30473100, 'epoch': '0.9542'}

 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 250/536 [24:35<27:30,  5.77s/it]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 251/536 [24:37<22:25,  4.72s/it]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 252/536 [24:39<18:50,  3.98s/it]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 253/536 [24:42<16:27,  3.49s/it]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹     | 254/536 [24:44<14:46,  3.14s/it]
 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 255/536 [24:46<13:26,  2.87s/it]
                                                 
{'loss': '0.467', 'grad_norm': '0.2305', 'learning_rate': '6.302e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6772', 'tokens/total': 33423360, 'tokens/trainable': 31078478, 'epoch': '0.9733'}

 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 255/536 [24:46<13:26,  2.87s/it]
 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 256/536 [24:49<12:32,  2.69s/it]
 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 257/536 [24:51<11:59,  2.58s/it]
 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 258/536 [24:53<11:39,  2.52s/it]
 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 259/536 [24:56<11:19,  2.45s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 260/536 [24:58<11:02,  2.40s/it]
                                                 
{'loss': '0.4511', 'grad_norm': '0.2148', 'learning_rate': '6.144e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6619', 'tokens/total': 34078720, 'tokens/trainable': 31682612, 'epoch': '0.9924'}

 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 260/536 [24:58<11:02,  2.40s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š     | 261/536 [25:00<10:50,  2.36s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 262/536 [25:03<10:56,  2.40s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 263/536 [25:06<12:20,  2.71s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 264/536 [25:08<11:41,  2.58s/it]
 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 265/536 [25:11<11:22,  2.52s/it]
                                                 
{'loss': '0.4682', 'grad_norm': '0.2451', 'learning_rate': '5.985e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6472', 'tokens/total': 34734080, 'tokens/trainable': 32293470, 'epoch': '1.011'}

 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 265/536 [25:11<11:22,  2.52s/it]
 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 266/536 [25:13<11:14,  2.50s/it]
 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰     | 267/536 [25:16<11:28,  2.56s/it]
 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 268/536 [25:18<11:16,  2.52s/it]
 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 269/536 [25:21<10:53,  2.45s/it]
 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 270/536 [25:23<10:39,  2.40s/it]
                                                 
{'loss': '0.461', 'grad_norm': '0.2207', 'learning_rate': '5.826e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6685', 'tokens/total': 35389440, 'tokens/trainable': 32904464, 'epoch': '1.031'}

 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 270/536 [25:23<10:39,  2.40s/it][2026-03-16 19:36:59,256] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-270


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.30s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.30s/it]

 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 271/536 [27:02<2:18:44, 31.41s/it]
 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 272/536 [27:04<1:39:50, 22.69s/it]
 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 273/536 [27:07<1:12:35, 16.56s/it]
 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ     | 274/536 [27:09<53:33, 12.26s/it]  
 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 275/536 [27:11<40:23,  9.29s/it]
                                                 
{'loss': '0.4545', 'grad_norm': '0.2324', 'learning_rate': '5.665e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6568', 'tokens/total': 36044800, 'tokens/trainable': 33517832, 'epoch': '1.05'}

 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 275/536 [27:11<40:23,  9.29s/it]
 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 276/536 [27:15<32:37,  7.53s/it]
 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 277/536 [27:17<25:45,  5.97s/it]
 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 278/536 [27:19<20:59,  4.88s/it]
 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 279/536 [27:22<17:35,  4.11s/it]
 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 280/536 [27:24<15:12,  3.56s/it]
                                                 
{'loss': '0.447', 'grad_norm': '0.2158', 'learning_rate': '5.503e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 36700160, 'tokens/trainable': 34129632, 'epoch': '1.069'}

 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 280/536 [27:24<15:12,  3.56s/it]
 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 281/536 [27:26<13:35,  3.20s/it]
 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 282/536 [27:28<12:20,  2.91s/it]
 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 283/536 [27:31<11:30,  2.73s/it]
 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 284/536 [27:33<10:54,  2.60s/it]
 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 285/536 [27:36<10:51,  2.60s/it]
                                                 
{'loss': '0.4378', 'grad_norm': '0.2119', 'learning_rate': '5.341e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5854', 'tokens/total': 37355520, 'tokens/trainable': 34742888, 'epoch': '1.088'}

 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 285/536 [27:36<10:51,  2.60s/it]
 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 286/536 [27:38<10:42,  2.57s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 287/536 [27:41<10:26,  2.51s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž    | 288/536 [27:43<10:02,  2.43s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 289/536 [27:45<09:45,  2.37s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 290/536 [27:47<09:40,  2.36s/it]
                                                 
{'loss': '0.4756', 'grad_norm': '0.2246', 'learning_rate': '5.179e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6534', 'tokens/total': 38010880, 'tokens/trainable': 35352848, 'epoch': '1.107'}

 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 290/536 [27:47<09:40,  2.36s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 291/536 [27:50<09:33,  2.34s/it]
 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 292/536 [27:52<09:27,  2.33s/it]
 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 293/536 [27:54<09:23,  2.32s/it]
 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–    | 294/536 [27:57<09:23,  2.33s/it]
 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 295/536 [27:59<09:21,  2.33s/it]
                                                 
{'loss': '0.4635', 'grad_norm': '0.2188', 'learning_rate': '5.016e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6541', 'tokens/total': 38666240, 'tokens/trainable': 35964736, 'epoch': '1.126'}

 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 295/536 [27:59<09:21,  2.33s/it]
 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 296/536 [28:01<09:16,  2.32s/it]
 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 297/536 [28:03<09:09,  2.30s/it][2026-03-16 19:39:38,467] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-297


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]

 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 298/536 [29:43<2:05:14, 31.57s/it]
 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 299/536 [29:46<1:30:06, 22.81s/it]
 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 300/536 [29:48<1:05:28, 16.65s/it]
                                                   
{'loss': '0.4578', 'grad_norm': '0.2334', 'learning_rate': '4.854e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6804', 'tokens/total': 39321600, 'tokens/trainable': 36579308, 'epoch': '1.145'}

 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 300/536 [29:48<1:05:28, 16.65s/it]
 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ    | 301/536 [29:50<48:19, 12.34s/it]  
 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 302/536 [29:53<36:18,  9.31s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 303/536 [29:55<27:59,  7.21s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 304/536 [29:57<22:22,  5.78s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 305/536 [30:00<18:50,  4.89s/it]
                                                 
{'loss': '0.4526', 'grad_norm': '0.2129', 'learning_rate': '4.691e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6214', 'tokens/total': 39976960, 'tokens/trainable': 37187912, 'epoch': '1.164'}

 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 305/536 [30:00<18:50,  4.89s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 306/536 [30:02<15:44,  4.11s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 307/536 [30:05<13:31,  3.55s/it]
 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹    | 308/536 [30:07<12:06,  3.19s/it]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 309/536 [30:09<11:00,  2.91s/it]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 310/536 [30:12<10:18,  2.74s/it]
                                                 
{'loss': '0.4482', 'grad_norm': '0.21', 'learning_rate': '4.529e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6526', 'tokens/total': 40632320, 'tokens/trainable': 37799984, 'epoch': '1.183'}

 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 310/536 [30:12<10:18,  2.74s/it]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 311/536 [30:14<09:49,  2.62s/it]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 312/536 [30:16<09:25,  2.53s/it]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 313/536 [30:18<09:07,  2.45s/it]
 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š    | 314/536 [30:21<09:33,  2.58s/it]
 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 315/536 [30:24<09:07,  2.48s/it]
                                                 
{'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '4.368e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6899', 'tokens/total': 41287680, 'tokens/trainable': 38409832, 'epoch': '1.202'}

 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 315/536 [30:24<09:07,  2.48s/it]
 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 316/536 [30:26<08:55,  2.43s/it]
 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 317/536 [30:28<08:44,  2.39s/it]
 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 318/536 [30:31<08:35,  2.36s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 319/536 [30:33<08:25,  2.33s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 320/536 [30:35<08:19,  2.31s/it]
                                                 
{'loss': '0.4539', 'grad_norm': '0.2285', 'learning_rate': '4.207e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6757', 'tokens/total': 41943040, 'tokens/trainable': 39020096, 'epoch': '1.221'}

 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 320/536 [30:35<08:19,  2.31s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰    | 321/536 [30:37<08:17,  2.31s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 322/536 [30:40<08:09,  2.29s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 323/536 [30:42<08:19,  2.35s/it]
 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 324/536 [30:45<08:26,  2.39s/it][2026-03-16 19:42:20,100] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-324


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.49s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.49s/it]

 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 325/536 [32:22<1:48:54, 30.97s/it]
                                                   
{'loss': '0.4481', 'grad_norm': '0.2246', 'learning_rate': '4.046e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6887', 'tokens/total': 42598400, 'tokens/trainable': 39629160, 'epoch': '1.24'}

 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 325/536 [32:22<1:48:54, 30.97s/it]
 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 326/536 [32:24<1:18:13, 22.35s/it]
 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 327/536 [32:27<56:54, 16.34s/it]  
 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    | 328/536 [32:29<42:00, 12.12s/it]
 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 329/536 [32:31<31:43,  9.20s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 330/536 [32:34<24:27,  7.12s/it]
                                                 
{'loss': '0.4542', 'grad_norm': '0.2256', 'learning_rate': '3.887e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6731', 'tokens/total': 43253760, 'tokens/trainable': 40237288, 'epoch': '1.26'}

 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 330/536 [32:34<24:27,  7.12s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 331/536 [32:36<19:21,  5.67s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 332/536 [32:39<16:57,  4.99s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 333/536 [32:42<14:06,  4.17s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 334/536 [32:44<12:07,  3.60s/it]
 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 335/536 [32:46<10:43,  3.20s/it]
                                                 
{'loss': '0.4412', 'grad_norm': '0.2539', 'learning_rate': '3.729e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6745', 'tokens/total': 43909120, 'tokens/trainable': 40848032, 'epoch': '1.279'}

 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 335/536 [32:46<10:43,  3.20s/it]
 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 336/536 [32:48<09:46,  2.93s/it]
 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 337/536 [32:51<09:05,  2.74s/it]
 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 338/536 [32:53<08:37,  2.61s/it]
 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 339/536 [32:55<08:14,  2.51s/it]
 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 340/536 [32:58<08:04,  2.47s/it]
                                                 
{'loss': '0.4615', 'grad_norm': '0.2217', 'learning_rate': '3.573e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6370', 'tokens/total': 44564480, 'tokens/trainable': 41457624, 'epoch': '1.298'}

 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 340/536 [32:58<08:04,  2.47s/it]
 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž   | 341/536 [33:00<08:05,  2.49s/it]
 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 342/536 [33:03<07:51,  2.43s/it]
 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 343/536 [33:05<07:49,  2.43s/it]
 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 344/536 [33:08<07:51,  2.46s/it]
 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 345/536 [33:10<07:38,  2.40s/it]
                                                 
{'loss': '0.4599', 'grad_norm': '0.2188', 'learning_rate': '3.418e-06', 'ppl': '1.584', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6795', 'tokens/total': 45219840, 'tokens/trainable': 42069272, 'epoch': '1.317'}

 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 345/536 [33:10<07:38,  2.40s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 346/536 [33:12<07:29,  2.37s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 347/536 [33:14<07:26,  2.36s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–   | 348/536 [33:17<07:20,  2.34s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 349/536 [33:19<07:12,  2.32s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 350/536 [33:21<07:09,  2.31s/it]
                                                 
{'loss': '0.4499', 'grad_norm': '0.2148', 'learning_rate': '3.264e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6691', 'tokens/total': 45875200, 'tokens/trainable': 42681132, 'epoch': '1.336'}

 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 350/536 [33:21<07:09,  2.31s/it]
 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 351/536 [33:24<07:04,  2.29s/it][2026-03-16 19:44:58,459] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-351


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.55s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.55s/it]

 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 352/536 [35:02<1:35:17, 31.07s/it]
 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 353/536 [35:04<1:08:31, 22.47s/it]
 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 354/536 [35:06<49:47, 16.42s/it]  
 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 355/536 [35:09<36:49, 12.21s/it]
                                                 
{'loss': '0.4529', 'grad_norm': '0.249', 'learning_rate': '3.113e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6371', 'tokens/total': 46530560, 'tokens/trainable': 43292904, 'epoch': '1.355'}

 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ   | 355/536 [35:09<36:49, 12.21s/it]
 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 356/536 [35:11<27:37,  9.21s/it]
 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 357/536 [35:13<21:14,  7.12s/it]
 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 358/536 [35:16<16:48,  5.67s/it]
 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 359/536 [35:18<13:56,  4.73s/it]
 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 360/536 [35:21<11:51,  4.04s/it]
                                                 
{'loss': '0.4461', 'grad_norm': '0.2207', 'learning_rate': '2.963e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6150', 'tokens/total': 47185920, 'tokens/trainable': 43900272, 'epoch': '1.374'}

 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 360/536 [35:21<11:51,  4.04s/it]
 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹   | 361/536 [35:23<10:14,  3.51s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 362/536 [35:25<09:24,  3.25s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 363/536 [35:28<08:33,  2.97s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 364/536 [35:30<07:52,  2.75s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 365/536 [35:32<07:23,  2.59s/it]
                                                 
{'loss': '0.4581', 'grad_norm': '0.3555', 'learning_rate': '2.816e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6767', 'tokens/total': 47841280, 'tokens/trainable': 44509872, 'epoch': '1.393'}

 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 365/536 [35:32<07:23,  2.59s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 366/536 [35:34<07:04,  2.50s/it]
 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 367/536 [35:37<06:49,  2.42s/it]
 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š   | 368/536 [35:39<06:39,  2.38s/it]
 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 369/536 [35:41<06:33,  2.36s/it]
 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 370/536 [35:44<06:33,  2.37s/it]
                                                 
{'loss': '0.4483', 'grad_norm': '0.2109', 'learning_rate': '2.671e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6359', 'tokens/total': 48496640, 'tokens/trainable': 45121444, 'epoch': '1.412'}

 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 370/536 [35:44<06:33,  2.37s/it]
 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 371/536 [35:46<06:27,  2.35s/it]
 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 372/536 [35:48<06:25,  2.35s/it]
 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 373/536 [35:51<06:17,  2.31s/it]
 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 374/536 [35:53<06:12,  2.30s/it]
 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 375/536 [35:55<06:13,  2.32s/it]
                                                 
{'loss': '0.4475', 'grad_norm': '0.2617', 'learning_rate': '2.528e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6505', 'tokens/total': 49152000, 'tokens/trainable': 45733296, 'epoch': '1.431'}

 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰   | 375/536 [35:55<06:13,  2.32s/it]
 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 376/536 [35:58<06:09,  2.31s/it]
 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 377/536 [36:00<06:26,  2.43s/it]
 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 378/536 [36:03<06:17,  2.39s/it][2026-03-16 19:47:37,290] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-378


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.77s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.77s/it]

 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 379/536 [37:39<1:20:25, 30.73s/it]
 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 380/536 [37:42<57:44, 22.21s/it]  
                                                 
{'loss': '0.4467', 'grad_norm': '0.208', 'learning_rate': '2.388e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6639', 'tokens/total': 49807360, 'tokens/trainable': 46341868, 'epoch': '1.45'}

 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 380/536 [37:42<57:44, 22.21s/it]
 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   | 381/536 [37:44<41:54, 16.22s/it]
 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 382/536 [37:46<31:01, 12.09s/it]
 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 383/536 [37:49<23:20,  9.16s/it]
 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 384/536 [37:51<17:57,  7.09s/it]
 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 385/536 [37:53<14:15,  5.66s/it]
                                                 
{'loss': '0.4373', 'grad_norm': '0.2129', 'learning_rate': '2.251e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6487', 'tokens/total': 50462720, 'tokens/trainable': 46948144, 'epoch': '1.469'}

 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 385/536 [37:53<14:15,  5.66s/it]
 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 386/536 [37:56<11:39,  4.67s/it]
 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 387/536 [37:58<09:46,  3.94s/it]
 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 388/536 [38:00<08:31,  3.45s/it]
 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 389/536 [38:03<07:40,  3.14s/it]
 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 390/536 [38:05<07:00,  2.88s/it]
                                                 
{'loss': '0.452', 'grad_norm': '0.2314', 'learning_rate': '2.117e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 51118080, 'tokens/trainable': 47558056, 'epoch': '1.489'}

 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 390/536 [38:05<07:00,  2.88s/it]
 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 391/536 [38:08<07:22,  3.05s/it]
 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 392/536 [38:11<06:49,  2.85s/it]
 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 393/536 [38:13<06:22,  2.68s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 394/536 [38:15<06:02,  2.55s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 395/536 [38:18<06:01,  2.56s/it]
                                                 
{'loss': '0.4435', 'grad_norm': '0.2139', 'learning_rate': '1.985e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5877', 'tokens/total': 51773440, 'tokens/trainable': 48168520, 'epoch': '1.508'}

 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž  | 395/536 [38:18<06:01,  2.56s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 396/536 [38:20<05:47,  2.48s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 397/536 [38:22<05:35,  2.42s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 398/536 [38:25<05:27,  2.37s/it]
 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 399/536 [38:27<05:22,  2.35s/it]
 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 400/536 [38:29<05:21,  2.37s/it]
                                                 
{'loss': '0.4444', 'grad_norm': '0.2236', 'learning_rate': '1.857e-06', 'ppl': '1.56', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6406', 'tokens/total': 52428800, 'tokens/trainable': 48779416, 'epoch': '1.527'}

 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 400/536 [38:29<05:21,  2.37s/it]
 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–  | 401/536 [38:32<05:15,  2.34s/it]
 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 402/536 [38:34<05:10,  2.32s/it]
 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 403/536 [38:36<05:10,  2.34s/it]
 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 404/536 [38:39<05:07,  2.33s/it]
 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 405/536 [38:41<05:03,  2.32s/it]
                                                 
{'loss': '0.4557', 'grad_norm': '0.2324', 'learning_rate': '1.732e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6719', 'tokens/total': 53084160, 'tokens/trainable': 49387032, 'epoch': '1.546'}

 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 405/536 [38:41<05:03,  2.32s/it][2026-03-16 19:50:15,755] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-405


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.46s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.46s/it]

 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 406/536 [40:19<1:06:58, 30.91s/it]
 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 407/536 [40:21<48:00, 22.33s/it]  
 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ  | 408/536 [40:23<34:49, 16.33s/it]
 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 409/536 [40:25<25:39, 12.12s/it]
 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 410/536 [40:28<19:38,  9.36s/it]
                                                 
{'loss': '0.4617', 'grad_norm': '0.2168', 'learning_rate': '1.611e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5250', 'tokens/total': 53739520, 'tokens/trainable': 49995344, 'epoch': '1.565'}

 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 410/536 [40:28<19:38,  9.36s/it]
 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 411/536 [40:31<15:03,  7.23s/it]
 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 412/536 [40:33<11:54,  5.76s/it]
 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 413/536 [40:36<09:53,  4.83s/it]
 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 414/536 [40:38<08:16,  4.07s/it]
 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 415/536 [40:40<07:07,  3.53s/it]
                                                 
{'loss': '0.4492', 'grad_norm': '0.2217', 'learning_rate': '1.493e-06', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6671', 'tokens/total': 54394880, 'tokens/trainable': 50603264, 'epoch': '1.584'}

 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹  | 415/536 [40:40<07:07,  3.53s/it]
 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 416/536 [40:43<06:19,  3.17s/it]
 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 417/536 [40:45<05:45,  2.90s/it]
 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 418/536 [40:47<05:26,  2.77s/it]
 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 419/536 [40:50<05:18,  2.73s/it]
 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 420/536 [40:52<05:02,  2.61s/it]
                                                 
{'loss': '0.4522', 'grad_norm': '0.2676', 'learning_rate': '1.379e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 55050240, 'tokens/trainable': 51213612, 'epoch': '1.603'}

 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 420/536 [40:52<05:02,  2.61s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 421/536 [40:55<04:49,  2.52s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š  | 422/536 [40:57<04:40,  2.46s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 423/536 [40:59<04:40,  2.49s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 424/536 [41:02<04:29,  2.41s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 425/536 [41:04<04:21,  2.36s/it]
                                                 
{'loss': '0.4414', 'grad_norm': '0.2168', 'learning_rate': '1.269e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6755', 'tokens/total': 55705600, 'tokens/trainable': 51819592, 'epoch': '1.622'}

 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 425/536 [41:04<04:21,  2.36s/it]
 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 426/536 [41:06<04:17,  2.34s/it]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 427/536 [41:08<04:13,  2.32s/it]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰  | 428/536 [41:11<04:08,  2.30s/it]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 429/536 [41:13<04:07,  2.31s/it]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 430/536 [41:15<04:04,  2.31s/it]
                                                 
{'loss': '0.4532', 'grad_norm': '0.2217', 'learning_rate': '1.163e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6642', 'tokens/total': 56360960, 'tokens/trainable': 52431520, 'epoch': '1.641'}

 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 430/536 [41:15<04:04,  2.31s/it]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 431/536 [41:18<04:10,  2.38s/it]
 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 432/536 [41:20<04:04,  2.35s/it][2026-03-16 19:52:55,057] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-432


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.45s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.45s/it]

 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 433/536 [42:58<53:14, 31.01s/it]
 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 434/536 [43:00<38:05, 22.41s/it]
 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 435/536 [43:03<27:33, 16.37s/it]
                                                 
{'loss': '0.4605', 'grad_norm': '0.3574', 'learning_rate': '1.061e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 57016320, 'tokens/trainable': 53041944, 'epoch': '1.66'}

 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 435/536 [43:03<27:33, 16.37s/it]
 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 436/536 [43:05<20:20, 12.20s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 437/536 [43:07<15:11,  9.20s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 438/536 [43:10<11:41,  7.16s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 439/536 [43:12<09:11,  5.68s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 440/536 [43:14<07:29,  4.68s/it]
                                                 
{'loss': '0.446', 'grad_norm': '0.2119', 'learning_rate': '9.626e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6513', 'tokens/total': 57671680, 'tokens/trainable': 53647180, 'epoch': '1.679'}

 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 440/536 [43:14<07:29,  4.68s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 441/536 [43:17<06:23,  4.04s/it]
 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 442/536 [43:19<05:31,  3.53s/it]
 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 443/536 [43:21<04:53,  3.16s/it]
 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 444/536 [43:24<04:26,  2.90s/it]
 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 445/536 [43:26<04:04,  2.69s/it]
                                                 
{'loss': '0.4299', 'grad_norm': '0.2188', 'learning_rate': '8.688e-07', 'ppl': '1.537', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6951', 'tokens/total': 58327040, 'tokens/trainable': 54256432, 'epoch': '1.698'}

 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 445/536 [43:26<04:04,  2.69s/it]
 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 446/536 [43:28<03:52,  2.58s/it]
 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 447/536 [43:31<03:41,  2.49s/it]
 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 448/536 [43:33<03:32,  2.41s/it]
 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 449/536 [43:35<03:30,  2.42s/it]
 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 450/536 [43:38<03:23,  2.37s/it]
                                                 
{'loss': '0.4583', 'grad_norm': '0.2188', 'learning_rate': '7.794e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6777', 'tokens/total': 58982400, 'tokens/trainable': 54863148, 'epoch': '1.718'}

 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 450/536 [43:38<03:23,  2.37s/it]
 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 451/536 [43:40<03:28,  2.46s/it]
 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 452/536 [43:43<03:24,  2.44s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 453/536 [43:45<03:19,  2.40s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 454/536 [43:47<03:15,  2.38s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 455/536 [43:50<03:12,  2.38s/it]
                                                 
{'loss': '0.4523', 'grad_norm': '0.2119', 'learning_rate': '6.945e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6454', 'tokens/total': 59637760, 'tokens/trainable': 55471580, 'epoch': '1.737'}

 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 455/536 [43:50<03:12,  2.38s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 456/536 [43:52<03:15,  2.44s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 457/536 [43:54<03:10,  2.41s/it]
 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 458/536 [43:57<03:11,  2.46s/it]
 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 459/536 [43:59<03:05,  2.41s/it][2026-03-16 19:55:34,637] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-459


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.63s/it]

 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 460/536 [45:37<39:11, 30.94s/it]
                                                 
{'loss': '0.4523', 'grad_norm': '0.2148', 'learning_rate': '6.141e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6711', 'tokens/total': 60293120, 'tokens/trainable': 56081832, 'epoch': '1.756'}

 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 460/536 [45:37<39:11, 30.94s/it]
 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 461/536 [45:39<27:55, 22.34s/it]
 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 462/536 [45:41<20:06, 16.30s/it]
 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 463/536 [45:44<14:42, 12.09s/it]
 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 464/536 [45:46<10:58,  9.14s/it]
 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 465/536 [45:48<08:21,  7.06s/it]
                                                 
{'loss': '0.4461', 'grad_norm': '0.4551', 'learning_rate': '5.383e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6903', 'tokens/total': 60948480, 'tokens/trainable': 56694240, 'epoch': '1.775'}

 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 465/536 [45:48<08:21,  7.06s/it]
 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 466/536 [45:50<06:33,  5.62s/it]
 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 467/536 [45:53<05:23,  4.69s/it]
 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 468/536 [45:55<04:29,  3.97s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 469/536 [45:57<03:50,  3.44s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 470/536 [46:01<03:45,  3.42s/it]
                                                 
{'loss': '0.4341', 'grad_norm': '0.208', 'learning_rate': '4.673e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4512', 'tokens/total': 61603840, 'tokens/trainable': 57301404, 'epoch': '1.794'}

 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 470/536 [46:01<03:45,  3.42s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 471/536 [46:03<03:20,  3.09s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 472/536 [46:05<03:02,  2.86s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 473/536 [46:08<02:50,  2.70s/it]
 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 474/536 [46:10<02:39,  2.58s/it]
 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 475/536 [46:12<02:34,  2.54s/it]
                                                 
{'loss': '0.4627', 'grad_norm': '0.2461', 'learning_rate': '4.011e-07', 'ppl': '1.588', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6249', 'tokens/total': 62259200, 'tokens/trainable': 57910216, 'epoch': '1.813'}

 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 475/536 [46:12<02:34,  2.54s/it]
 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 476/536 [46:15<02:26,  2.45s/it]
 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 477/536 [46:17<02:24,  2.45s/it]
 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 478/536 [46:20<02:25,  2.51s/it]
 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 479/536 [46:22<02:19,  2.45s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 480/536 [46:24<02:13,  2.39s/it]
                                                 
{'loss': '0.4538', 'grad_norm': '0.2178', 'learning_rate': '3.397e-07', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6635', 'tokens/total': 62914560, 'tokens/trainable': 58517712, 'epoch': '1.832'}

 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 480/536 [46:24<02:13,  2.39s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 481/536 [46:27<02:09,  2.36s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 482/536 [46:29<02:05,  2.33s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 483/536 [46:31<02:03,  2.33s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 484/536 [46:34<02:01,  2.33s/it]
 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 485/536 [46:36<02:04,  2.44s/it]
                                                 
{'loss': '0.4395', 'grad_norm': '0.208', 'learning_rate': '2.833e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6504', 'tokens/total': 63569920, 'tokens/trainable': 59125608, 'epoch': '1.851'}

 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 485/536 [46:36<02:04,  2.44s/it]
 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 486/536 [46:39<01:59,  2.39s/it][2026-03-16 19:58:13,418] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-486


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.82s/it]

 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 487/536 [48:16<25:09, 30.81s/it]
 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 488/536 [48:18<17:48, 22.26s/it]
 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 489/536 [48:20<12:44, 16.26s/it]
 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 490/536 [48:24<09:32, 12.45s/it]
                                                 
{'loss': '0.4478', 'grad_norm': '0.2236', 'learning_rate': '2.318e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4290', 'tokens/total': 64225280, 'tokens/trainable': 59733412, 'epoch': '1.87'}

 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 490/536 [48:24<09:32, 12.45s/it]
 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 491/536 [48:26<07:02,  9.39s/it]
 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 492/536 [48:28<05:19,  7.26s/it]
 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 493/536 [48:31<04:08,  5.78s/it]
 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 494/536 [48:33<03:22,  4.82s/it]
 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 495/536 [48:36<02:47,  4.08s/it]
                                                 
{'loss': '0.4362', 'grad_norm': '0.2129', 'learning_rate': '1.854e-07', 'ppl': '1.547', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6524', 'tokens/total': 64880640, 'tokens/trainable': 60339904, 'epoch': '1.889'}

 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 495/536 [48:36<02:47,  4.08s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 496/536 [48:38<02:25,  3.64s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 497/536 [48:40<02:06,  3.23s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 498/536 [48:43<01:52,  2.96s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 499/536 [48:45<01:42,  2.76s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 500/536 [48:47<01:34,  2.62s/it]
                                                 
{'loss': '0.4656', 'grad_norm': '0.2217', 'learning_rate': '1.441e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6649', 'tokens/total': 65536000, 'tokens/trainable': 60945412, 'epoch': '1.908'}

 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 500/536 [48:47<01:34,  2.62s/it]
 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 501/536 [48:50<01:29,  2.55s/it]
 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 502/536 [48:52<01:25,  2.52s/it]
 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 503/536 [48:54<01:20,  2.44s/it]
 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 504/536 [48:57<01:16,  2.39s/it]
 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 505/536 [48:59<01:13,  2.36s/it]
                                                 
{'loss': '0.4466', 'grad_norm': '0.2129', 'learning_rate': '1.079e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6658', 'tokens/total': 66191360, 'tokens/trainable': 61550936, 'epoch': '1.927'}

 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 505/536 [48:59<01:13,  2.36s/it]
 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 506/536 [49:01<01:09,  2.33s/it]
 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 507/536 [49:04<01:07,  2.31s/it]
 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 508/536 [49:06<01:04,  2.31s/it]
 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 509/536 [49:08<01:04,  2.37s/it]
 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 510/536 [49:11<01:01,  2.35s/it]
                                                 
{'loss': '0.4754', 'grad_norm': '0.2168', 'learning_rate': '7.691e-08', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 66846720, 'tokens/trainable': 62157088, 'epoch': '1.947'}

 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 510/536 [49:11<01:01,  2.35s/it]
 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 511/536 [49:13<00:58,  2.34s/it]
 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 512/536 [49:15<00:55,  2.32s/it]
 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 513/536 [49:18<00:53,  2.31s/it][2026-03-16 20:00:52,618] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-513


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.79s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.79s/it]

 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 514/536 [50:57<11:33, 31.50s/it]
 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 515/536 [51:00<07:58, 22.77s/it]
                                                 
{'loss': '0.4548', 'grad_norm': '0.2217', 'learning_rate': '5.11e-08', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6342', 'tokens/total': 67502080, 'tokens/trainable': 62762592, 'epoch': '1.966'}

 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 515/536 [51:00<07:58, 22.77s/it]
 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 516/536 [51:02<05:32, 16.65s/it]
 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 517/536 [51:04<03:54, 12.32s/it]
 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 518/536 [51:06<02:47,  9.30s/it]
 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 519/536 [51:09<02:02,  7.18s/it]
 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 520/536 [51:11<01:32,  5.76s/it]
                                                 
{'loss': '0.4544', 'grad_norm': '0.2314', 'learning_rate': '3.054e-08', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6125', 'tokens/total': 68157440, 'tokens/trainable': 63366800, 'epoch': '1.985'}

 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 520/536 [51:11<01:32,  5.76s/it]
 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 521/536 [51:13<01:10,  4.72s/it]
 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 522/536 [51:16<00:55,  3.98s/it]
 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 523/536 [51:18<00:45,  3.47s/it]
 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 524/536 [51:20<00:38,  3.19s/it]
 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 525/536 [51:24<00:35,  3.21s/it]
                                                 
{'loss': '0.4504', 'grad_norm': '0.2246', 'learning_rate': '1.522e-08', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6753', 'tokens/total': 68812800, 'tokens/trainable': 63975056, 'epoch': '2.004'}

 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 525/536 [51:24<00:35,  3.21s/it]
 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 526/536 [51:26<00:29,  2.92s/it]
 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 527/536 [51:28<00:24,  2.75s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 528/536 [51:31<00:21,  2.71s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 529/536 [51:33<00:18,  2.60s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 530/536 [51:36<00:15,  2.52s/it]
                                                 
{'loss': '0.4546', 'grad_norm': '0.2109', 'learning_rate': '5.182e-09', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6575', 'tokens/total': 69468160, 'tokens/trainable': 64586448, 'epoch': '2.023'}

 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 530/536 [51:36<00:15,  2.52s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 531/536 [51:38<00:12,  2.45s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 532/536 [51:40<00:09,  2.40s/it]
 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 533/536 [51:43<00:07,  2.40s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 534/536 [51:45<00:04,  2.39s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 535/536 [51:47<00:02,  2.42s/it]
                                                 
{'loss': '0.4493', 'grad_norm': '0.2314', 'learning_rate': '4.231e-10', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6115', 'tokens/total': 70123520, 'tokens/trainable': 65199364, 'epoch': '2.042'}

100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 535/536 [51:47<00:02,  2.42s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [51:50<00:00,  2.37s/it][2026-03-16 20:03:25,941] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-536


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.68s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:16<00:00, 16.68s/it]

                                                 
{'train_runtime': '3210', 'train_samples_per_second': '2.672', 'train_steps_per_second': '0.167', 'train_loss': '0.4897', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'epoch': '2.046', 'tokens/train_per_sec_per_gpu': '6757'}

100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [53:26<00:00,  2.37s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 536/536 [53:26<00:00,  5.98s/it]
[2026-03-16 20:04:52,263] [INFO] [axolotl.train.save_trained_model:237] [PID:213] Training completed! Saving trained model to ./outputs/qwen3-sft-stmt-tk/.
[2026-03-16 20:05:01,009] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.53s/it]
Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:17<00:00, 17.53s/it]
[2026-03-16 20:05:19,091] [INFO] [axolotl.train.save_trained_model:351] [PID:213] Model successfully saved to ./outputs/qwen3-sft-stmt-tk/