File size: 60,542 Bytes
cb08aa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
[2026-03-27 09:52:09,822] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:64102] bf16 support detected, enabling for this configuration.
[2026-03-27 09:52:09,985] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:64102] baseline 0.000GB ()
[2026-03-27 09:52:09,985] [INFO] [axolotl.cli.config.load_cfg:341] [PID:64102] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "ministral3-3b-qlora.yaml",
  "base_model": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "base_model_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "batch_size": 1,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_89",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 0,
  "dataset_num_proc": 16,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "AlexHung29629/test_data_123",
      "trust_remote_code": false,
      "type": {
        "field_instruction": "input",
        "field_output": "output",
        "field_system": "system",
        "format": "{instruction}",
        "no_input_format": "{instruction}",
        "system_prompt": ""
      }
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flex_attention": true,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "include_tkps": true,
  "is_multimodal": true,
  "layer_offloading": false,
  "learning_rate": 2e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 1,
  "model_config_type": "mistral3",
  "model_config_type_text": "ministral3",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_bnb_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/out",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.5,
  "saves_per_epoch": 1,
  "scaling_softmax": true,
  "sequence_len": 32768,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "tokenizer_save_jinja_files": true,
  "tokenizer_use_mistral_common": false,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "unfrozen_parameters": [
    "^model.language_model.norm.weight$",
    "^model.language_model.layers.2[0-5].[.a-z_]+$"
  ],
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.0,
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-03-27 09:52:10,190] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:64102] Loaded image size: 1540 from model config
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 / </s>
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 / <s>
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 / <pad>
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 / <unk>
[2026-03-27 09:52:11,874] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:64102] Loading prepared dataset from disk at last_run_prepared/0a6d77d9f0fbd2dd6692eaf810500a77...
[2026-03-27 09:52:11,880] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:64102] total_num_tokens: 336_571
[2026-03-27 09:52:11,881] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:64102] `total_supervised_tokens: 3_275`
[2026-03-27 09:52:14,284] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9476313591003418
[2026-03-27 09:52:15,197] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9123454093933105
[2026-03-27 09:52:16,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9608397483825684
[2026-03-27 09:52:17,093] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9345409870147705
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:64102] data_loader_len: 12
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:64102] sample_packing_eff_est across ranks: [0.9337574351917614]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:64102] sample_packing_eff_est: 0.94
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:64102] total_num_steps: 24
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:64102] Maximum number of steps set at 24
[2026-03-27 09:52:17,148] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:64102] loading tokenizer... mistralai/Ministral-3-3B-Instruct-2512-BF16
[2026-03-27 09:52:19,018] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 / </s>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 / <s>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 / <pad>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 / <unk>
[2026-03-27 09:52:23,670] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:64102] Loading model
[2026-03-27 09:52:23,794] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:64102] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-27 09:52:23,796] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:64102] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-27 09:52:23,797] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:402] [PID:64102] Applying multipack dataloader patch for sample packing...
[2026-03-27 09:52:23,820] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:64102] Applying Cut Cross Entropy to model type: mistral3

Loading weights:   0%|                                                                                                                                                                      | 0/458 [00:00<?, ?it/s]
Loading weights: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 458/458 [00:00<00:00, 5938.85it/s]
[2026-03-27 09:52:25,202] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:64102] Converting modules to torch.bfloat16
[2026-03-27 09:52:25,776] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:64102] Memory usage after model load 0.000GB ()
[2026-03-27 09:52:26,681] [INFO] [axolotl.monkeypatch.scaled_softmax_attn.patch_scaled_softmax_attention:46] [PID:64102] Patched flex_attention with SSMax (s=0.43, b=0.0)
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.norm.weight
model.language_model.layers.20.self_attn.q_proj.weight
model.language_model.layers.20.self_attn.k_proj.weight
model.language_model.layers.20.self_attn.v_proj.weight
model.language_model.layers.20.self_attn.o_proj.weight
model.language_model.layers.20.mlp.gate_proj.weight
model.language_model.layers.20.mlp.up_proj.weight
model.language_model.layers.20.mlp.down_proj.weight
model.language_model.layers.20.input_layernorm.weight
model.language_model.layers.20.post_attention_layernorm.weight
model.language_model.layers.21.self_attn.q_proj.weight
model.language_model.layers.21.self_attn.k_proj.weight
model.language_model.layers.21.self_attn.v_proj.weight
model.language_model.layers.21.self_attn.o_proj.weight
model.language_model.layers.21.mlp.gate_proj.weight
model.language_model.layers.21.mlp.up_proj.weight
model.language_model.layers.21.mlp.down_proj.weight
model.language_model.layers.21.input_layernorm.weight
model.language_model.layers.21.post_attention_layernorm.weight
model.language_model.layers.22.self_attn.q_proj.weight
model.language_model.layers.22.self_attn.k_proj.weight
model.language_model.layers.22.self_attn.v_proj.weight
model.language_model.layers.22.self_attn.o_proj.weight
model.language_model.layers.22.mlp.gate_proj.weight
model.language_model.layers.22.mlp.up_proj.weight
model.language_model.layers.22.mlp.down_proj.weight
model.language_model.layers.22.input_layernorm.weight
model.language_model.layers.22.post_attention_layernorm.weight
model.language_model.layers.23.self_attn.q_proj.weight
model.language_model.layers.23.self_attn.k_proj.weight
model.language_model.layers.23.self_attn.v_proj.weight
model.language_model.layers.23.self_attn.o_proj.weight
model.language_model.layers.23.mlp.gate_proj.weight
model.language_model.layers.23.mlp.up_proj.weight
model.language_model.layers.23.mlp.down_proj.weight
model.language_model.layers.23.input_layernorm.weight
model.language_model.layers.23.post_attention_layernorm.weight
model.language_model.layers.24.self_attn.q_proj.weight
model.language_model.layers.24.self_attn.k_proj.weight
model.language_model.layers.24.self_attn.v_proj.weight
model.language_model.layers.24.self_attn.o_proj.weight
model.language_model.layers.24.mlp.gate_proj.weight
model.language_model.layers.24.mlp.up_proj.weight
model.language_model.layers.24.mlp.down_proj.weight
model.language_model.layers.24.input_layernorm.weight
model.language_model.layers.24.post_attention_layernorm.weight
model.language_model.layers.25.self_attn.q_proj.weight
model.language_model.layers.25.self_attn.k_proj.weight
model.language_model.layers.25.self_attn.v_proj.weight
model.language_model.layers.25.self_attn.o_proj.weight
model.language_model.layers.25.mlp.gate_proj.weight
model.language_model.layers.25.mlp.up_proj.weight
model.language_model.layers.25.mlp.down_proj.weight
model.language_model.layers.25.input_layernorm.weight
model.language_model.layers.25.post_attention_layernorm.weight
model.language_model.norm.weight
[2026-03-27 09:52:30,361] [INFO] [axolotl.train.save_initial_configs:421] [PID:64102] Pre-saving tokenizer to ./outputs/out...
[2026-03-27 09:52:30,482] [INFO] [axolotl.train.save_initial_configs:426] [PID:64102] Pre-saving model config to ./outputs/out...
[2026-03-27 09:52:30,484] [INFO] [axolotl.train.save_initial_configs:430] [PID:64102] Pre-saving processor to ./outputs/out...
[2026-03-27 09:52:31,014] [INFO] [axolotl.train.execute_training:222] [PID:64102] Starting trainer...
[2026-03-27 09:52:33,177] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.922748327255249
[2026-03-27 09:52:34,095] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9173660278320312
[2026-03-27 09:52:34,994] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.8986146450042725
[2026-03-27 09:52:35,943] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9490597248077393
[2026-03-27 09:52:35,943] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]

  0%|                                                                                                                                                                                        | 0/24 [00:00<?, ?it/s][2026-03-27 09:52:39,230] [WARNING] [py.warnings._showwarnmsg:110] [PID:64102] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
  _warn_once(


  4%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                                                                        | 1/24 [00:18<06:55, 18.09s/it]
                                                                                                                                                                                                                    
{'loss': '7.497', 'grad_norm': '235', 'learning_rate': '2e-05', 'ppl': '1802', 'memory/max_active (GiB)': '17.98', 'memory/max_allocated (GiB)': '17.98', 'memory/device_reserved (GiB)': '18.96', 'tokens/train_per_sec_per_gpu': '92.75', 'tokens/total': 32768, 'tokens/trainable': 1575, 'epoch': '0.08333'}

  4%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                                                                        | 1/24 [00:18<06:55, 18.09s/it]
  8%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                                                                 | 2/24 [00:29<05:12, 14.22s/it]
                                                                                                                                                                                                                    
{'loss': '4.327', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '75.75', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.94', 'tokens/total': 65536, 'tokens/trainable': 1815, 'epoch': '0.1667'}

  8%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                                                                 | 2/24 [00:29<05:12, 14.22s/it]
 12%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                                                                          | 3/24 [00:44<05:01, 14.33s/it]
                                                                                                                                                                                                                    
{'loss': '3.383', 'grad_norm': '276', 'learning_rate': '2e-05', 'ppl': '29.46', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.705', 'tokens/total': 98304, 'tokens/trainable': 1854, 'epoch': '0.25'}

 12%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                                                                          | 3/24 [00:44<05:01, 14.33s/it]
 17%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                                                  | 4/24 [00:58<04:47, 14.39s/it]
                                                                                                                                                                                                                    
{'loss': '4.797', 'grad_norm': '111', 'learning_rate': '2e-05', 'ppl': '121.1', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.89', 'tokens/total': 131072, 'tokens/trainable': 2141, 'epoch': '0.3333'}

 17%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                                                  | 4/24 [00:58<04:47, 14.39s/it]
 21%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                                           | 5/24 [01:06<03:52, 12.24s/it]
                                                                                                                                                                                                                    
{'loss': '2.682', 'grad_norm': '54', 'learning_rate': '2e-05', 'ppl': '14.61', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '44.22', 'tokens/total': 163840, 'tokens/trainable': 2512, 'epoch': '0.4167'}

 21%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                                           | 5/24 [01:06<03:52, 12.24s/it]
 25%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                                                    | 6/24 [01:21<03:54, 13.03s/it]
                                                                                                                                                                                                                    
{'loss': '3.572', 'grad_norm': '90', 'learning_rate': '2e-05', 'ppl': '35.58', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.84', 'tokens/total': 196608, 'tokens/trainable': 2771, 'epoch': '0.5'}

 25%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                                                    | 6/24 [01:21<03:54, 13.03s/it]
 29%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                            | 7/24 [01:35<03:44, 13.18s/it]
                                                                                                                                                                                                                    
{'loss': '1.781', 'grad_norm': '202', 'learning_rate': '2e-05', 'ppl': '5.938', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.935', 'tokens/total': 229376, 'tokens/trainable': 2797, 'epoch': '0.5833'}

 29%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                                                                                                            | 7/24 [01:35<03:44, 13.18s/it]
 33%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                     | 8/24 [01:44<03:14, 12.15s/it]
                                                                                                                                                                                                                    
{'loss': '3.93', 'grad_norm': '55.75', 'learning_rate': '2e-05', 'ppl': '50.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.41', 'tokens/total': 262144, 'tokens/trainable': 2999, 'epoch': '0.6667'}

 33%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                                                                                     | 8/24 [01:44<03:14, 12.15s/it]
 38%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                              | 9/24 [01:59<03:14, 12.99s/it]
                                                                                                                                                                                                                    
{'loss': '3.408', 'grad_norm': '233', 'learning_rate': '2e-05', 'ppl': '30.21', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.624', 'tokens/total': 294912, 'tokens/trainable': 3023, 'epoch': '0.75'}

 38%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                                                              | 9/24 [01:59<03:14, 12.99s/it]
 42%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‰                                                                                                      | 10/24 [02:13<03:05, 13.25s/it]
                                                                                                                                                                                                                    
{'loss': '1.711', 'grad_norm': '214', 'learning_rate': '2e-05', 'ppl': '5.535', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.03', 'tokens/total': 327680, 'tokens/trainable': 3051, 'epoch': '0.8333'}

 42%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‰                                                                                                      | 10/24 [02:13<03:05, 13.25s/it]
 46%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                                                                                              | 11/24 [02:27<02:54, 13.39s/it]
                                                                                                                                                                                                                    
{'loss': '1.723', 'grad_norm': '166', 'learning_rate': '2e-05', 'ppl': '5.604', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.637', 'tokens/total': 360448, 'tokens/trainable': 3087, 'epoch': '0.9167'}

 46%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                                                                                              | 11/24 [02:27<02:54, 13.39s/it]
 50%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Œ                                                                                       | 12/24 [02:42<02:48, 14.00s/it]
                                                                                                                                                                                                                    
{'loss': '4.694', 'grad_norm': '94.5', 'learning_rate': '2e-05', 'ppl': '109.3', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.2', 'tokens/total': 393216, 'tokens/trainable': 3275, 'epoch': '1'}

 50%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Œ                                                                                       | 12/24 [02:42<02:48, 14.00s/it][2026-03-27 09:55:18,727] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-12


Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.19s/it]
Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.19s/it]

 54%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Š                                                                                | 13/24 [03:08<03:11, 17.45s/it]
                                                                                                                                                                                                                    
{'loss': '1.37', 'grad_norm': '53.25', 'learning_rate': '2e-05', 'ppl': '3.936', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '10.1', 'tokens/total': 425984, 'tokens/trainable': 3411, 'epoch': '1.083'}

 54%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Š                                                                                | 13/24 [03:08<03:11, 17.45s/it]
 58%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                         | 14/24 [03:16<02:27, 14.79s/it]
                                                                                                                                                                                                                    
{'loss': '1.797', 'grad_norm': '22.75', 'learning_rate': '2e-05', 'ppl': '6.033', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '69.67', 'tokens/total': 458752, 'tokens/trainable': 4011, 'epoch': '1.167'}

 58%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ                                                                         | 14/24 [03:16<02:27, 14.79s/it]
 62%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                                                                 | 15/24 [03:30<02:10, 14.48s/it]
                                                                                                                                                                                                                    
{'loss': '0.4977', 'grad_norm': '110.5', 'learning_rate': '2e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.045', 'tokens/total': 491520, 'tokens/trainable': 4039, 'epoch': '1.25'}

 62%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                                                                 | 15/24 [03:30<02:10, 14.48s/it]
 67%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                          | 16/24 [03:45<01:57, 14.75s/it]
                                                                                                                                                                                                                    
{'loss': '2.569', 'grad_norm': '52.5', 'learning_rate': '2e-05', 'ppl': '13.05', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.26', 'tokens/total': 524288, 'tokens/trainable': 4227, 'epoch': '1.333'}

 67%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹                                                          | 16/24 [03:45<01:57, 14.75s/it]
 71%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‰                                                   | 17/24 [03:59<01:41, 14.48s/it]
                                                                                                                                                                                                                    
{'loss': '0.1539', 'grad_norm': '23.75', 'learning_rate': '2e-05', 'ppl': '1.166', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '3.55', 'tokens/total': 557056, 'tokens/trainable': 4276, 'epoch': '1.417'}

 71%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‰                                                   | 17/24 [03:59<01:41, 14.48s/it]
 75%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                           | 18/24 [04:14<01:27, 14.54s/it]
                                                                                                                                                                                                                    
{'loss': '1.552', 'grad_norm': '41.25', 'learning_rate': '2e-05', 'ppl': '4.722', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.7', 'tokens/total': 589824, 'tokens/trainable': 4535, 'epoch': '1.5'}

 75%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Ž                                           | 18/24 [04:14<01:27, 14.54s/it]
 79%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Œ                                    | 19/24 [04:29<01:13, 14.64s/it]
                                                                                                                                                                                                                    
{'loss': '0.6448', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.906', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.618', 'tokens/total': 622592, 'tokens/trainable': 4559, 'epoch': '1.583'}

 79%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Œ                                    | 19/24 [04:29<01:13, 14.64s/it]
 83%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Š                             | 20/24 [04:44<00:58, 14.74s/it]
                                                                                                                                                                                                                    
{'loss': '0.6573', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.93', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.617', 'tokens/total': 655360, 'tokens/trainable': 4598, 'epoch': '1.667'}

 83%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–Š                             | 20/24 [04:44<00:58, 14.74s/it]
 88%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                     | 21/24 [04:56<00:41, 13.93s/it]
                                                                                                                                                                                                                    
{'loss': '1.997', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '7.369', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '8.91', 'tokens/total': 688128, 'tokens/trainable': 4705, 'epoch': '1.75'}

 88%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–                     | 21/24 [04:56<00:41, 13.93s/it]
 92%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–              | 22/24 [05:11<00:28, 14.21s/it]
                                                                                                                                                                                                                    
{'loss': '2.558', 'grad_norm': '59.75', 'learning_rate': '2e-05', 'ppl': '12.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.36', 'tokens/total': 720896, 'tokens/trainable': 4992, 'epoch': '1.833'}

 92%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–              | 22/24 [05:11<00:28, 14.21s/it]
 96%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹       | 23/24 [05:22<00:13, 13.41s/it]
                                                                                                                                                                                                                    
{'loss': '2.735', 'grad_norm': '60.5', 'learning_rate': '2e-05', 'ppl': '15.41', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '7.224', 'tokens/total': 753664, 'tokens/trainable': 5075, 'epoch': '1.917'}

 96%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‹       | 23/24 [05:22<00:13, 13.41s/it]
100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 24/24 [05:37<00:00, 13.76s/it]
                                                                                                                                                                                                                    
{'loss': '4.172', 'grad_norm': '46.5', 'learning_rate': '2e-05', 'ppl': '64.87', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '101.3', 'tokens/total': 786432, 'tokens/trainable': 6550, 'epoch': '2'}

100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 24/24 [05:37<00:00, 13.76s/it][2026-03-27 09:58:13,284] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-24


Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.19s/it]
Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.19s/it]

                                                                                                                                                                                                                    
{'train_runtime': '347.4', 'train_samples_per_second': '0.069', 'train_steps_per_second': '0.069', 'train_loss': '2.675', 'memory/max_active (GiB)': '8.52', 'memory/max_allocated (GiB)': '8.52', 'memory/device_reserved (GiB)': '20.65', 'epoch': '2', 'tokens/train_per_sec_per_gpu': '0'}

100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 24/24 [05:47<00:00, 13.76s/it]
100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 24/24 [05:47<00:00, 14.47s/it]
[2026-03-27 09:58:23,361] [INFO] [axolotl.train.save_trained_model:241] [PID:64102] Training completed! Saving trained model to ./outputs/out.

Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.04s/it]
Writing model shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1/1 [00:09<00:00,  9.04s/it]
[2026-03-27 09:58:32,450] [INFO] [axolotl.train.save_trained_model:355] [PID:64102] Model successfully saved to ./outputs/out