File size: 208,368 Bytes
9604973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
[2025-11-27 00:21:02,496] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:80269] baseline 0.000GB ()
[2025-11-27 00:21:02,496] [INFO] [axolotl.cli.config.load_cfg:248] [PID:80269] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "seedcoder.yaml",
  "base_model": "ByteDance-Seed/Seed-Coder-8B-Instruct",
  "base_model_config": "ByteDance-Seed/Seed-Coder-8B-Instruct",
  "batch_size": 128,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": false,
    "n_gpu": 8,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 8,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 208,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "messages",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "new_data_clean.jsonl",
      "roles": {
        "assistant": [
          "assistant"
        ],
        "system": [
          "system"
        ],
        "user": [
          "user"
        ]
      },
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": true,
  "deepspeed": {
    "bf16": {
      "enabled": "auto"
    },
    "fp16": {
      "auto_cast": false,
      "enabled": "auto",
      "hysteresis": 2,
      "initial_scale_power": 32,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "min_loss_scale": 1
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false,
    "zero_optimization": {
      "contiguous_gradients": true,
      "offload_optimizer": {
        "device": "cpu"
      },
      "overlap_comm": true,
      "stage": 2
    }
  },
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 16,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": false
  },
  "group_by_length": false,
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": true,
  "is_mistral_derived_model": false,
  "learning_rate": 0.0001,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 64,
  "lora_dropout": 0.05,
  "lora_r": 64,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 16,
  "model_config_type": "llama",
  "num_epochs": 1.0,
  "optimizer": "adamw_torch",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./nov262025-sc-LoRA-Run",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 60,
  "save_total_limit": 100,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "ByteDance-Seed/Seed-Coder-8B-Instruct",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_entity": "test-aa",
  "wandb_name": "nov-26-sc-lor-run-1",
  "wandb_project": "seedcoder",
  "warmup_ratio": 0.05,
  "weight_decay": 0.0,
  "world_size": 8
}
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None
[2025-11-27 00:21:41,317] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:80269] Loading prepared dataset from disk at last_run_prepared/683f1b6addffef1a6c101561a46fc077...

Loading dataset from disk:   0%|                                                                                                                                      | 0/110 [00:00<?, ?it/s]
Loading dataset from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 82772.41it/s]
[2025-11-27 00:21:41,594] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:80269] total_num_tokens: 75_959_959
[2025-11-27 00:21:42,244] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:80269] `total_supervised_tokens: 5_309_191`
[2025-11-27 00:21:42,245] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:80269] total_num_steps: 221
[2025-11-27 00:21:42,245] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:80269] Maximum number of steps set at 221
[2025-11-27 00:21:42,270] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:80269] Loading tokenizer... ByteDance-Seed/Seed-Coder-8B-Instruct
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:80269] Loading model
[2025-11-27 00:21:42,935] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:80269] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-11-27 00:21:42,937] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:80269] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-11-27 00:21:42,955] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:80269] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}

Loading checkpoint shards:   0%|                                                                                                                                        | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards:  25%|████████████████████████████████                                                                                                | 1/4 [00:01<00:03,  1.31s/it]
Loading checkpoint shards:  50%|████████████████████████████████████████████████████████████████                                                                | 2/4 [00:02<00:02,  1.27s/it]
Loading checkpoint shards:  75%|████████████████████████████████████████████████████████████████████████████████████████████████                                | 3/4 [00:03<00:01,  1.28s/it]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.07it/s]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.06s/it]
[2025-11-27 00:22:10,720] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:80269] Converting modules to torch.bfloat16
[2025-11-27 00:22:10,723] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] Memory usage after model load 18.938GB (+18.938GB allocated, +20.139GB reserved)
[2025-11-27 00:22:10,724] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:80269] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 167,772,160 || all params: 8,418,234,368 || trainable%: 1.9930
[2025-11-27 00:22:12,070] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] after adapters 16.002GB (+16.002GB allocated, +20.436GB reserved)
[2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:398] [PID:80269] Pre-saving adapter config to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:402] [PID:80269] Pre-saving tokenizer to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,712] [INFO] [axolotl.train.save_initial_configs:407] [PID:80269] Pre-saving model config to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,716] [INFO] [axolotl.train.execute_training:196] [PID:80269] Starting trainer...
Time to load cpu_adam op: 2.386819839477539 seconds
wandb: Currently logged in as: pandyamarut (test-aa) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: ⢿ Waiting for wandb.init()...

m
wandb: ⣻ Waiting for wandb.init()...

m
wandb: Tracking run with wandb version 0.22.3
wandb: Run data is saved locally in /osmosis/wandb/run-20251127_002220-5un64tuw
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run nov-26-sc-lor-run-1
wandb: ⭐️ View project at https://wandb.ai/test-aa/seedcoder
wandb: 🚀 View run at https://wandb.ai/test-aa/seedcoder/runs/5un64tuw
wandb: Detected [huggingface_hub.inference] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-11-27 00:22:21,580] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:80269] The Axolotl config has been saved to the WandB run under files.
[2025-11-27 00:22:22,118] [INFO] [axolotl.utils.callbacks.on_train_begin:820] [PID:80269] The DeepSpeed config has been saved to the WandB run under files.

  0%|                                                                                                                                                                 | 0/221 [00:00<?, ?it/s]
  0%|▋                                                                                                                                                        | 1/221 [00:10<37:28, 10.22s/it]
                                                                                                                                                                                              
{'loss': 0.0756, 'grad_norm': 0.10699598491191864, 'learning_rate': 0.0, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.81, 'tokens_per_second_per_gpu': 336.12, 'epoch': 0.0}

  0%|▋                                                                                                                                                        | 1/221 [00:10<37:28, 10.22s/it]
  1%|█▍                                                                                                                                                       | 2/221 [00:17<31:29,  8.63s/it]
                                                                                                                                                                                              
{'loss': 0.069, 'grad_norm': 0.10141075402498245, 'learning_rate': 9.090909090909091e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 499.67, 'epoch': 0.01}

  1%|█▍                                                                                                                                                       | 2/221 [00:17<31:29,  8.63s/it]
  1%|██                                                                                                                                                       | 3/221 [00:24<28:45,  7.92s/it]
                                                                                                                                                                                              
{'loss': 0.0881, 'grad_norm': 0.11926735192537308, 'learning_rate': 1.8181818181818182e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 511.33, 'epoch': 0.01}

  1%|██                                                                                                                                                       | 3/221 [00:24<28:45,  7.92s/it]
  2%|██▊                                                                                                                                                      | 4/221 [00:31<27:22,  7.57s/it]
                                                                                                                                                                                              
{'loss': 0.0793, 'grad_norm': 0.11467798799276352, 'learning_rate': 2.7272727272727273e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.69, 'epoch': 0.02}

  2%|██▊                                                                                                                                                      | 4/221 [00:31<27:22,  7.57s/it]
  2%|███▍                                                                                                                                                     | 5/221 [00:38<26:33,  7.38s/it]
                                                                                                                                                                                              
{'loss': 0.0894, 'grad_norm': 0.10370815545320511, 'learning_rate': 3.6363636363636364e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 327.47, 'epoch': 0.02}

  2%|███▍                                                                                                                                                     | 5/221 [00:38<26:33,  7.38s/it]
  3%|████▏                                                                                                                                                    | 6/221 [00:45<26:02,  7.27s/it]
                                                                                                                                                                                              
{'loss': 0.0653, 'grad_norm': 0.0778045579791069, 'learning_rate': 4.545454545454546e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 438.39, 'epoch': 0.03}

  3%|████▏                                                                                                                                                    | 6/221 [00:45<26:02,  7.27s/it]
  3%|████▊                                                                                                                                                    | 7/221 [00:53<25:48,  7.23s/it]
                                                                                                                                                                                              
{'loss': 0.0604, 'grad_norm': 0.05470091104507446, 'learning_rate': 5.4545454545454546e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 454.21, 'epoch': 0.03}

  3%|████▊                                                                                                                                                    | 7/221 [00:53<25:48,  7.23s/it]
  4%|█████▌                                                                                                                                                   | 8/221 [01:00<25:36,  7.21s/it]
                                                                                                                                                                                              
{'loss': 0.0575, 'grad_norm': 0.04792458191514015, 'learning_rate': 6.363636363636364e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 381.85, 'epoch': 0.04}

  4%|█████▌                                                                                                                                                   | 8/221 [01:00<25:36,  7.21s/it]
  4%|██████▏                                                                                                                                                  | 9/221 [01:07<25:21,  7.18s/it]
                                                                                                                                                                                              
{'loss': 0.057, 'grad_norm': 0.04809016361832619, 'learning_rate': 7.272727272727273e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 395.18, 'epoch': 0.04}

  4%|██████▏                                                                                                                                                  | 9/221 [01:07<25:21,  7.18s/it]
  5%|██████▉                                                                                                                                                 | 10/221 [01:14<25:06,  7.14s/it]
                                                                                                                                                                                              
{'loss': 0.0472, 'grad_norm': 0.05050504207611084, 'learning_rate': 8.181818181818183e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 437.16, 'epoch': 0.05}

  5%|██████▉                                                                                                                                                 | 10/221 [01:14<25:06,  7.14s/it]
  5%|███████▌                                                                                                                                                | 11/221 [01:21<25:01,  7.15s/it]
                                                                                                                                                                                              
{'loss': 0.0422, 'grad_norm': 0.057043518871068954, 'learning_rate': 9.090909090909092e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 386.23, 'epoch': 0.05}

  5%|███████▌                                                                                                                                                | 11/221 [01:21<25:01,  7.15s/it]
  5%|████████▎                                                                                                                                               | 12/221 [01:28<25:00,  7.18s/it]
                                                                                                                                                                                              
{'loss': 0.033, 'grad_norm': 0.04036800563335419, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 309.64, 'epoch': 0.05}

  5%|████████▎                                                                                                                                               | 12/221 [01:28<25:00,  7.18s/it]
  6%|████████▉                                                                                                                                               | 13/221 [01:35<24:46,  7.15s/it]
                                                                                                                                                                                              
{'loss': 0.0429, 'grad_norm': 0.03150289133191109, 'learning_rate': 9.999440509051368e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 452.59, 'epoch': 0.06}

  6%|████████▉                                                                                                                                               | 13/221 [01:35<24:46,  7.15s/it]
  6%|█████████▋                                                                                                                                              | 14/221 [01:43<24:39,  7.15s/it]
                                                                                                                                                                                              
{'loss': 0.0381, 'grad_norm': 0.03723820298910141, 'learning_rate': 9.997762161417517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 365.29, 'epoch': 0.06}

  6%|█████████▋                                                                                                                                              | 14/221 [01:43<24:39,  7.15s/it]
  7%|██████▋                                                                                           | 15/221 [01:50<24:29,  7.13s/it]                                                      
                                                                                                                                        
{'loss': 0.0445, 'grad_norm': 0.026561176404356956, 'learning_rate': 9.994965332706573e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 383.32, 'epoch': 0.07}

  7%|██████▋                                                                                           | 15/221 [01:50<24:29,  7.13s/it]
  7%|███████                                                                                           | 16/221 [01:57<24:21,  7.13s/it]
                                                                                                                                        
{'loss': 0.0347, 'grad_norm': 0.022102832794189453, 'learning_rate': 9.991050648838675e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 571.37, 'epoch': 0.07}

  7%|███████                                                                                           | 16/221 [01:57<24:21,  7.13s/it]
  8%|███████▌                                                                                          | 17/221 [02:04<24:15,  7.14s/it]
                                                                                                                                        
{'loss': 0.0332, 'grad_norm': 0.026612414047122, 'learning_rate': 9.986018985905901e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 371.09, 'epoch': 0.08}

  8%|███████▌                                                                                          | 17/221 [02:04<24:15,  7.14s/it]
  8%|███████▉                                                                                          | 18/221 [02:11<24:04,  7.11s/it]
                                                                                                                                        
{'loss': 0.0384, 'grad_norm': 0.02820519544184208, 'learning_rate': 9.979871469976196e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 359.8, 'epoch': 0.08}

  8%|███████▉                                                                                          | 18/221 [02:11<24:04,  7.11s/it]
  9%|████████▍                                                                                         | 19/221 [02:18<23:55,  7.11s/it]
                                                                                                                                        
{'loss': 0.0306, 'grad_norm': 0.06290236860513687, 'learning_rate': 9.972609476841367e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 255.21, 'epoch': 0.09}

  9%|████████▍                                                                                         | 19/221 [02:18<23:55,  7.11s/it]
  9%|████████▊                                                                                         | 20/221 [02:25<23:49,  7.11s/it]
                                                                                                                                        
{'loss': 0.0328, 'grad_norm': 0.02102799527347088, 'learning_rate': 9.964234631709187e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 388.05, 'epoch': 0.09}

  9%|████████▊                                                                                         | 20/221 [02:25<23:49,  7.11s/it]
 10%|█████████▎                                                                                        | 21/221 [02:32<23:46,  7.13s/it]
                                                                                                                                        
{'loss': 0.0336, 'grad_norm': 0.022319750860333443, 'learning_rate': 9.954748808839674e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 496.89, 'epoch': 0.1}

 10%|█████████▎                                                                                        | 21/221 [02:32<23:46,  7.13s/it]
 10%|█████████▊                                                                                        | 22/221 [02:40<23:41,  7.14s/it]
                                                                                                                                        
{'loss': 0.0349, 'grad_norm': 0.019568774849176407, 'learning_rate': 9.944154131125642e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 417.97, 'epoch': 0.1}

 10%|█████████▊                                                                                        | 22/221 [02:40<23:41,  7.14s/it]
 10%|██████████▏                                                                                       | 23/221 [02:47<23:30,  7.12s/it]
                                                                                                                                        
{'loss': 0.0382, 'grad_norm': 0.04317627474665642, 'learning_rate': 9.932452969617607e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 377.6, 'epoch': 0.1}

 10%|██████████▏                                                                                       | 23/221 [02:47<23:30,  7.12s/it]
 11%|██████████▋                                                                                       | 24/221 [02:54<23:32,  7.17s/it]
                                                                                                                                        
{'loss': 0.0361, 'grad_norm': 0.027220861986279488, 'learning_rate': 9.919647942993148e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 468.41, 'epoch': 0.11}

 11%|██████████▋                                                                                       | 24/221 [02:54<23:32,  7.17s/it]
 11%|███████████                                                                                       | 25/221 [03:01<23:27,  7.18s/it]
                                                                                                                                        
{'loss': 0.031, 'grad_norm': 0.019090518355369568, 'learning_rate': 9.905741916970864e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.96, 'epoch': 0.11}

 11%|███████████                                                                                       | 25/221 [03:01<23:27,  7.18s/it]
 12%|███████████▌                                                                                      | 26/221 [03:09<23:34,  7.26s/it]
                                                                                                                                        
{'loss': 0.0316, 'grad_norm': 0.019753405824303627, 'learning_rate': 9.890738003669029e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 385.75, 'epoch': 0.12}

 12%|███████████▌                                                                                      | 26/221 [03:09<23:34,  7.26s/it]
 12%|███████████▉                                                                                      | 27/221 [03:16<23:29,  7.26s/it]
                                                                                                                                        
{'loss': 0.0402, 'grad_norm': 0.021183036267757416, 'learning_rate': 9.874639560909117e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 481.45, 'epoch': 0.12}

 12%|███████████▉                                                                                      | 27/221 [03:16<23:29,  7.26s/it]
 13%|████████████▍                                                                                     | 28/221 [03:23<23:16,  7.23s/it]
                                                                                                                                        
{'loss': 0.0312, 'grad_norm': 0.018204571679234505, 'learning_rate': 9.857450191464337e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 317.6, 'epoch': 0.13}

 13%|████████████▍                                                                                     | 28/221 [03:23<23:16,  7.23s/it]
 13%|████████████▊                                                                                     | 29/221 [03:30<23:06,  7.22s/it]
                                                                                                                                        
{'loss': 0.0343, 'grad_norm': 0.02151501551270485, 'learning_rate': 9.839173742253334e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 471.66, 'epoch': 0.13}

 13%|████████████▊                                                                                     | 29/221 [03:30<23:06,  7.22s/it]
 14%|█████████████▎                                                                                    | 30/221 [03:37<22:51,  7.18s/it]
                                                                                                                                        
{'loss': 0.0337, 'grad_norm': 0.021778756752610207, 'learning_rate': 9.819814303479267e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 391.67, 'epoch': 0.14}

 14%|█████████████▎                                                                                    | 30/221 [03:37<22:51,  7.18s/it]
 14%|█████████████▋                                                                                    | 31/221 [03:45<22:53,  7.23s/it]
                                                                                                                                        
{'loss': 0.0264, 'grad_norm': 0.01405468862503767, 'learning_rate': 9.799376207714445e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 441.4, 'epoch': 0.14}

 14%|█████████████▋                                                                                    | 31/221 [03:45<22:53,  7.23s/it]
 14%|██████████████▏                                                                                   | 32/221 [03:52<22:39,  7.19s/it]
                                                                                                                                        
{'loss': 0.0325, 'grad_norm': 0.0183633491396904, 'learning_rate': 9.777864028930705e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 449.02, 'epoch': 0.14}

 14%|██████████████▏                                                                                   | 32/221 [03:52<22:39,  7.19s/it]
 15%|██████████████▋                                                                                   | 33/221 [03:59<22:35,  7.21s/it]
                                                                                                                                        
{'loss': 0.0342, 'grad_norm': 0.022185783833265305, 'learning_rate': 9.755282581475769e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 356.6, 'epoch': 0.15}

 15%|██████████████▋                                                                                   | 33/221 [03:59<22:35,  7.21s/it]
 15%|███████████████                                                                                   | 34/221 [04:06<22:21,  7.17s/it]
                                                                                                                                        
{'loss': 0.0255, 'grad_norm': 0.015691177919507027, 'learning_rate': 9.731636918995821e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 461.92, 'epoch': 0.15}

 15%|███████████████                                                                                   | 34/221 [04:06<22:21,  7.17s/it]
 16%|███████████████▌                                                                                  | 35/221 [04:13<22:14,  7.17s/it]
                                                                                                                                        
{'loss': 0.0314, 'grad_norm': 0.01962122693657875, 'learning_rate': 9.706932333304517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 428.5, 'epoch': 0.16}

 16%|███████████████▌                                                                                  | 35/221 [04:13<22:14,  7.17s/it]
 16%|███████████████▉                                                                                  | 36/221 [04:20<22:05,  7.17s/it]
                                                                                                                                        
{'loss': 0.0396, 'grad_norm': 0.01783391274511814, 'learning_rate': 9.681174353198687e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 393.65, 'epoch': 0.16}

 16%|███████████████▉                                                                                  | 36/221 [04:20<22:05,  7.17s/it]
 17%|████████████████▍                                                                                 | 37/221 [04:28<21:57,  7.16s/it]
                                                                                                                                        
{'loss': 0.0351, 'grad_norm': 0.020520439371466637, 'learning_rate': 9.654368743221022e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 331.18, 'epoch': 0.17}

 17%|████████████████▍                                                                                 | 37/221 [04:28<21:57,  7.16s/it]
 17%|████████████████▊                                                                                 | 38/221 [04:35<21:47,  7.14s/it]
                                                                                                                                        
{'loss': 0.0341, 'grad_norm': 0.019169267266988754, 'learning_rate': 9.626521502369984e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 429.02, 'epoch': 0.17}

 17%|████████████████▊                                                                                 | 38/221 [04:35<21:47,  7.14s/it]
 18%|█████████████████▎                                                                                | 39/221 [04:42<21:41,  7.15s/it]
                                                                                                                                        
{'loss': 0.0351, 'grad_norm': 0.02138075977563858, 'learning_rate': 9.597638862757255e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 491.69, 'epoch': 0.18}

 18%|█████████████████▎                                                                                | 39/221 [04:42<21:41,  7.15s/it]
 18%|█████████████████▋                                                                                | 40/221 [04:49<21:37,  7.17s/it]
                                                                                                                                        
{'loss': 0.0338, 'grad_norm': 0.0176653191447258, 'learning_rate': 9.567727288213005e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.26, 'epoch': 0.18}

 18%|█████████████████▋                                                                                | 40/221 [04:49<21:37,  7.17s/it]
 19%|██████████████████▏                                                                               | 41/221 [04:56<21:29,  7.17s/it]
                                                                                                                                        
{'loss': 0.0316, 'grad_norm': 0.017243385314941406, 'learning_rate': 9.536793472839325e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.46, 'epoch': 0.19}

 19%|██████████████████▏                                                                               | 41/221 [04:56<21:29,  7.17s/it]
 19%|██████████████████▌                                                                               | 42/221 [05:03<21:18,  7.14s/it]
                                                                                                                                        
{'loss': 0.0216, 'grad_norm': 0.0146207669749856, 'learning_rate': 9.504844339512095e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 363.65, 'epoch': 0.19}

 19%|██████████████████▌                                                                               | 42/221 [05:03<21:18,  7.14s/it]
 19%|███████████████████                                                                               | 43/221 [05:10<21:11,  7.14s/it]
                                                                                                                                        
{'loss': 0.0335, 'grad_norm': 0.017516395077109337, 'learning_rate': 9.471887038331685e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.46, 'epoch': 0.19}

 19%|███████████████████                                                                               | 43/221 [05:10<21:11,  7.14s/it]
 20%|███████████████████▌                                                                              | 44/221 [05:18<21:05,  7.15s/it]
                                                                                                                                        
{'loss': 0.0313, 'grad_norm': 0.01834929920732975, 'learning_rate': 9.437928945022771e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.41, 'epoch': 0.2}

 20%|███████████████████▌                                                                              | 44/221 [05:18<21:05,  7.15s/it]
 20%|███████████████████▉                                                                              | 45/221 [05:25<20:58,  7.15s/it]
                                                                                                                                        
{'loss': 0.0312, 'grad_norm': 0.018410420045256615, 'learning_rate': 9.40297765928369e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 414.21, 'epoch': 0.2}

 20%|███████████████████▉                                                                              | 45/221 [05:25<20:58,  7.15s/it]
 21%|████████████████████▍                                                                             | 46/221 [05:32<20:43,  7.11s/it]
                                                                                                                                        
{'loss': 0.0335, 'grad_norm': 0.043539997190237045, 'learning_rate': 9.367041003085649e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 319.79, 'epoch': 0.21}

 21%|████████████████████▍                                                                             | 46/221 [05:32<20:43,  7.11s/it]
 21%|████████████████████▊                                                                             | 47/221 [05:39<20:40,  7.13s/it]
                                                                                                                                        
{'loss': 0.0307, 'grad_norm': 0.019783005118370056, 'learning_rate': 9.330127018922194e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 262.29, 'epoch': 0.21}

 21%|████████████████████▊                                                                             | 47/221 [05:39<20:40,  7.13s/it]
 22%|█████████████████████▎                                                                            | 48/221 [05:46<20:31,  7.12s/it]
                                                                                                                                        
{'loss': 0.0308, 'grad_norm': 0.018382525071501732, 'learning_rate': 9.292243968009331e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.22}

 22%|█████████████████████▎                                                                            | 48/221 [05:46<20:31,  7.12s/it]
 22%|█████████████████████▋                                                                            | 49/221 [05:53<20:25,  7.13s/it]
                                                                                                                                        
{'loss': 0.0338, 'grad_norm': 0.021564122289419174, 'learning_rate': 9.253400328436699e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 484.09, 'epoch': 0.22}

 22%|█████████████████████▋                                                                            | 49/221 [05:53<20:25,  7.13s/it]
 23%|██████████████████████▏                                                                           | 50/221 [06:00<20:16,  7.12s/it]
                                                                                                                                        
{'loss': 0.0285, 'grad_norm': 0.016710789874196053, 'learning_rate': 9.213604793270196e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.23}

 23%|██████████████████████▏                                                                           | 50/221 [06:00<20:16,  7.12s/it]
 23%|██████████████████████▌                                                                           | 51/221 [06:07<20:10,  7.12s/it]
                                                                                                                                        
{'loss': 0.0293, 'grad_norm': 0.016314025968313217, 'learning_rate': 9.172866268606513e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.09, 'epoch': 0.23}

 23%|██████████████████████▌                                                                           | 51/221 [06:07<20:10,  7.12s/it]
 24%|███████████████████████                                                                           | 52/221 [06:14<20:02,  7.12s/it]
                                                                                                                                        
{'loss': 0.0273, 'grad_norm': 0.01764376275241375, 'learning_rate': 9.131193871579975e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 558.97, 'epoch': 0.24}

 24%|███████████████████████                                                                           | 52/221 [06:14<20:02,  7.12s/it]
 24%|███████████████████████▌                                                                          | 53/221 [06:21<19:49,  7.08s/it]
                                                                                                                                        
{'loss': 0.0245, 'grad_norm': 0.01896459050476551, 'learning_rate': 9.088596928322158e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.06, 'epoch': 0.24}

 24%|███████████████████████▌                                                                          | 53/221 [06:21<19:49,  7.08s/it]
 24%|███████████████████████▉                                                                          | 54/221 [06:29<19:48,  7.12s/it]
                                                                                                                                        
{'loss': 0.0301, 'grad_norm': 0.01687958650290966, 'learning_rate': 9.045084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 296.5, 'epoch': 0.24}

 24%|███████████████████████▉                                                                          | 54/221 [06:29<19:48,  7.12s/it]
 25%|████████████████████████▍                                                                         | 55/221 [06:36<19:41,  7.12s/it]
                                                                                                                                        
{'loss': 0.0341, 'grad_norm': 0.021479196846485138, 'learning_rate': 9.000667740056032e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.19, 'epoch': 0.25}

 25%|████████████████████████▍                                                                         | 55/221 [06:36<19:41,  7.12s/it]
 25%|████████████████████████▊                                                                         | 56/221 [06:43<19:39,  7.15s/it]
                                                                                                                                        
{'loss': 0.0271, 'grad_norm': 0.016711527481675148, 'learning_rate': 8.955355173281708e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.2, 'epoch': 0.25}

 25%|████████████████████████▊                                                                         | 56/221 [06:43<19:39,  7.15s/it]
 26%|█████████████████████████▎                                                                        | 57/221 [06:50<19:37,  7.18s/it]
                                                                                                                                        
{'loss': 0.0287, 'grad_norm': 0.01733219437301159, 'learning_rate': 8.90915741234015e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.43, 'epoch': 0.26}

 26%|█████████████████████████▎                                                                        | 57/221 [06:50<19:37,  7.18s/it]
 26%|█████████████████████████▋                                                                        | 58/221 [06:57<19:27,  7.16s/it]
                                                                                                                                        
{'loss': 0.0254, 'grad_norm': 0.01601138710975647, 'learning_rate': 8.862084796122998e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.26}

 26%|█████████████████████████▋                                                                        | 58/221 [06:57<19:27,  7.16s/it]
 27%|██████████████████████████▏                                                                       | 59/221 [07:04<19:16,  7.14s/it]
                                                                                                                                        
{'loss': 0.0311, 'grad_norm': 0.018599703907966614, 'learning_rate': 8.814147859311332e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.27}

 27%|██████████████████████████▏                                                                       | 59/221 [07:04<19:16,  7.14s/it]
 27%|██████████████████████████▌                                                                       | 60/221 [07:12<19:09,  7.14s/it]
                                                                                                                                        
{'loss': 0.0309, 'grad_norm': 0.01615062914788723, 'learning_rate': 8.765357330018056e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.15, 'epoch': 0.27}

 27%|██████████████████████████▌                                                                       | 60/221 [07:12<19:09,  7.14s/it][2025-11-27 00:29:44,068] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-60

 28%|███████████████████████████                                                                       | 61/221 [07:31<28:50, 10.82s/it]
                                                                                                                                        
{'loss': 0.0245, 'grad_norm': 0.01592225581407547, 'learning_rate': 8.715724127386972e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.76, 'epoch': 0.28}

 28%|███████████████████████████                                                                       | 61/221 [07:31<28:50, 10.82s/it]
 28%|███████████████████████████▍                                                                      | 62/221 [07:38<25:36,  9.66s/it]
                                                                                                                                        
{'loss': 0.0295, 'grad_norm': 0.02008405141532421, 'learning_rate': 8.665259359149132e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 447.91, 'epoch': 0.28}

 28%|███████████████████████████▍                                                                      | 62/221 [07:38<25:36,  9.66s/it]
 29%|███████████████████████████▉                                                                      | 63/221 [07:45<23:25,  8.90s/it]
                                                                                                                                        
{'loss': 0.0366, 'grad_norm': 0.019492069259285927, 'learning_rate': 8.613974319136958e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 405.86, 'epoch': 0.29}

 29%|███████████████████████████▉                                                                      | 63/221 [07:45<23:25,  8.90s/it]
 29%|████████████████████████████▍                                                                     | 64/221 [07:52<21:51,  8.36s/it]
                                                                                                                                        
{'loss': 0.0284, 'grad_norm': 0.02178225666284561, 'learning_rate': 8.561880484756725e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.25, 'epoch': 0.29}

 29%|████████████████████████████▍                                                                     | 64/221 [07:52<21:51,  8.36s/it]
 29%|████████████████████████████▊                                                                     | 65/221 [07:59<20:46,  7.99s/it]
                                                                                                                                        
{'loss': 0.0297, 'grad_norm': 0.019038653001189232, 'learning_rate': 8.508989514419958e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 345.76, 'epoch': 0.29}

 29%|████████████████████████████▊                                                                     | 65/221 [07:59<20:46,  7.99s/it]
 30%|█████████████████████████████▎                                                                    | 66/221 [08:07<20:04,  7.77s/it]
                                                                                                                                        
{'loss': 0.0293, 'grad_norm': 0.01684654876589775, 'learning_rate': 8.455313244934324e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.88, 'epoch': 0.3}

 30%|█████████████████████████████▎                                                                    | 66/221 [08:07<20:04,  7.77s/it]
 30%|█████████████████████████████▋                                                                    | 67/221 [08:14<19:25,  7.57s/it]
                                                                                                                                        
{'loss': 0.0239, 'grad_norm': 0.01636500470340252, 'learning_rate': 8.400863688854597e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.77, 'epoch': 0.3}

 30%|█████████████████████████████▋                                                                    | 67/221 [08:14<19:25,  7.57s/it]
 31%|██████████████████████████████▏                                                                   | 68/221 [08:21<18:56,  7.43s/it]
                                                                                                                                        
{'loss': 0.0263, 'grad_norm': 0.020848819985985756, 'learning_rate': 8.345653031794292e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 365.36, 'epoch': 0.31}

 31%|██████████████████████████████▏                                                                   | 68/221 [08:21<18:56,  7.43s/it]
 31%|██████████████████████████████▌                                                                   | 69/221 [08:28<18:33,  7.33s/it]
                                                                                                                                        
{'loss': 0.0355, 'grad_norm': 0.02269025892019272, 'learning_rate': 8.289693629698564e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.76, 'epoch': 0.31}

 31%|██████████████████████████████▌                                                                   | 69/221 [08:28<18:33,  7.33s/it]
 32%|███████████████████████████████                                                                   | 70/221 [08:35<18:16,  7.26s/it]
                                                                                                                                        
{'loss': 0.0284, 'grad_norm': 0.01883563958108425, 'learning_rate': 8.232998006078997e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.65, 'epoch': 0.32}

 32%|███████████████████████████████                                                                   | 70/221 [08:35<18:16,  7.26s/it]
 32%|███████████████████████████████▍                                                                  | 71/221 [08:42<17:58,  7.19s/it]
                                                                                                                                        
{'loss': 0.0261, 'grad_norm': 0.017690833657979965, 'learning_rate': 8.175578849210895e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 441.77, 'epoch': 0.32}

 32%|███████████████████████████████▍                                                                  | 71/221 [08:42<17:58,  7.19s/it]
 33%|███████████████████████████████▉                                                                  | 72/221 [08:49<17:49,  7.17s/it]
                                                                                                                                        
{'loss': 0.0282, 'grad_norm': 0.018213583156466484, 'learning_rate': 8.117449009293668e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.18, 'epoch': 0.33}

 33%|███████████████████████████████▉                                                                  | 72/221 [08:49<17:49,  7.17s/it]
 33%|████████████████████████████████▎                                                                 | 73/221 [08:56<17:36,  7.14s/it]
                                                                                                                                        
{'loss': 0.0309, 'grad_norm': 0.019336581230163574, 'learning_rate': 8.058621495575032e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.23, 'epoch': 0.33}

 33%|████████████████████████████████▎                                                                 | 73/221 [08:56<17:36,  7.14s/it]
 33%|████████████████████████████████▊                                                                 | 74/221 [09:03<17:25,  7.11s/it]
                                                                                                                                        
{'loss': 0.0312, 'grad_norm': 0.019243910908699036, 'learning_rate': 7.999109473439569e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.43, 'epoch': 0.33}

 33%|████████████████████████████████▊                                                                 | 74/221 [09:03<17:25,  7.11s/it]
 34%|█████████████████████████████████▎                                                                | 75/221 [09:10<17:17,  7.11s/it]
                                                                                                                                        
{'loss': 0.0317, 'grad_norm': 0.01973150111734867, 'learning_rate': 7.938926261462366e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.63, 'epoch': 0.34}

 34%|█████████████████████████████████▎                                                                | 75/221 [09:10<17:17,  7.11s/it]
 34%|█████████████████████████████████▋                                                                | 76/221 [09:18<17:15,  7.14s/it]
                                                                                                                                        
{'loss': 0.0293, 'grad_norm': 0.02182990498840809, 'learning_rate': 7.878085328428369e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.11, 'epoch': 0.34}

 34%|█████████████████████████████████▋                                                                | 76/221 [09:18<17:15,  7.14s/it]
 35%|██████████████████████████████████▏                                                               | 77/221 [09:25<17:06,  7.13s/it]
                                                                                                                                        
{'loss': 0.0287, 'grad_norm': 0.018669869750738144, 'learning_rate': 7.81660029031811e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 408.44, 'epoch': 0.35}

 35%|██████████████████████████████████▏                                                               | 77/221 [09:25<17:06,  7.13s/it]
 35%|██████████████████████████████████▌                                                               | 78/221 [09:32<16:59,  7.13s/it]
                                                                                                                                        
{'loss': 0.0315, 'grad_norm': 0.016969047486782074, 'learning_rate': 7.754484907260513e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.64, 'epoch': 0.35}

 35%|██████████████████████████████████▌                                                               | 78/221 [09:32<16:59,  7.13s/it]
 36%|███████████████████████████████████                                                               | 79/221 [09:39<16:48,  7.10s/it]
                                                                                                                                        
{'loss': 0.0278, 'grad_norm': 0.019713636487722397, 'learning_rate': 7.691753080453412e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 397.76, 'epoch': 0.36}

 36%|███████████████████████████████████                                                               | 79/221 [09:39<16:48,  7.10s/it]
 36%|███████████████████████████████████▍                                                              | 80/221 [09:46<16:45,  7.13s/it]
                                                                                                                                        
{'loss': 0.0255, 'grad_norm': 0.017600620165467262, 'learning_rate': 7.628418849052523e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.33, 'epoch': 0.36}

 36%|███████████████████████████████████▍                                                              | 80/221 [09:46<16:45,  7.13s/it]
 37%|███████████████████████████████████▉                                                              | 81/221 [09:53<16:36,  7.12s/it]
                                                                                                                                        
{'loss': 0.0234, 'grad_norm': 0.018615400418639183, 'learning_rate': 7.564496387029532e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 352.61, 'epoch': 0.37}

 37%|███████████████████████████████████▉                                                              | 81/221 [09:53<16:36,  7.12s/it]
 37%|████████████████████████████████████▎                                                             | 82/221 [10:00<16:27,  7.11s/it]
                                                                                                                                        
{'loss': 0.0234, 'grad_norm': 0.023312179371714592, 'learning_rate': 7.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 499.51, 'epoch': 0.37}

 37%|████████████████████████████████████▎                                                             | 82/221 [10:00<16:27,  7.11s/it]
 38%|████████████████████████████████████▊                                                             | 83/221 [10:07<16:23,  7.13s/it]
                                                                                                                                        
{'loss': 0.0321, 'grad_norm': 0.01922520436346531, 'learning_rate': 7.434944122021836e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 419.08, 'epoch': 0.38}

 38%|████████████████████████████████████▊                                                             | 83/221 [10:07<16:23,  7.13s/it]
 38%|█████████████████████████████████████▏                                                            | 84/221 [10:14<16:11,  7.09s/it]
                                                                                                                                        
{'loss': 0.0254, 'grad_norm': 0.019153179600834846, 'learning_rate': 7.369343312364993e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.78, 'epoch': 0.38}

 38%|█████████████████████████████████████▏                                                            | 84/221 [10:14<16:11,  7.09s/it]
 38%|█████████████████████████████████████▋                                                            | 85/221 [10:22<16:12,  7.15s/it]
                                                                                                                                        
{'loss': 0.0343, 'grad_norm': 0.020285822451114655, 'learning_rate': 7.303212252253162e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 371.99, 'epoch': 0.38}

 38%|█████████████████████████████████████▋                                                            | 85/221 [10:22<16:12,  7.15s/it]
 39%|██████████████████████████████████████▏                                                           | 86/221 [10:29<15:59,  7.11s/it]
                                                                                                                                        
{'loss': 0.0249, 'grad_norm': 0.016675548627972603, 'learning_rate': 7.236565741578163e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.7, 'epoch': 0.39}

 39%|██████████████████████████████████████▏                                                           | 86/221 [10:29<15:59,  7.11s/it]
 39%|██████████████████████████████████████▌                                                           | 87/221 [10:36<15:59,  7.16s/it]
                                                                                                                                        
{'loss': 0.0289, 'grad_norm': 0.015159820206463337, 'learning_rate': 7.169418695587791e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 344.64, 'epoch': 0.39}

 39%|██████████████████████████████████████▌                                                           | 87/221 [10:36<15:59,  7.16s/it]
 40%|███████████████████████████████████████                                                           | 88/221 [10:43<15:52,  7.16s/it]
                                                                                                                                        
{'loss': 0.0276, 'grad_norm': 0.018055099993944168, 'learning_rate': 7.101786141547828e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 375.32, 'epoch': 0.4}

 40%|███████████████████████████████████████                                                           | 88/221 [10:43<15:52,  7.16s/it]
 40%|███████████████████████████████████████▍                                                          | 89/221 [10:50<15:42,  7.14s/it]
                                                                                                                                        
{'loss': 0.0277, 'grad_norm': 0.01955697126686573, 'learning_rate': 7.033683215379002e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.83, 'epoch': 0.4}

 40%|███████████████████████████████████████▍                                                          | 89/221 [10:50<15:42,  7.14s/it]
 41%|███████████████████████████████████████▉                                                          | 90/221 [10:57<15:36,  7.15s/it]
                                                                                                                                        
{'loss': 0.0277, 'grad_norm': 0.01860162802040577, 'learning_rate': 6.965125158269619e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.84, 'epoch': 0.41}

 41%|███████████████████████████████████████▉                                                          | 90/221 [10:57<15:36,  7.15s/it]
 41%|████████████████████████████████████████▎                                                         | 91/221 [11:04<15:25,  7.12s/it]
                                                                                                                                        
{'loss': 0.0322, 'grad_norm': 0.02057529240846634, 'learning_rate': 6.896127313264643e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.85, 'epoch': 0.41}

 41%|████████████████████████████████████████▎                                                         | 91/221 [11:04<15:25,  7.12s/it]
 42%|████████████████████████████████████████▊                                                         | 92/221 [11:11<15:15,  7.09s/it]
                                                                                                                                        
{'loss': 0.0228, 'grad_norm': 0.017251698300242424, 'learning_rate': 6.826705121831976e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 483.38, 'epoch': 0.42}

 42%|████████████████████████████████████████▊                                                         | 92/221 [11:11<15:15,  7.09s/it]
 42%|█████████████████████████████████████████▏                                                        | 93/221 [11:19<15:21,  7.20s/it]
                                                                                                                                        
{'loss': 0.028, 'grad_norm': 0.017092842608690262, 'learning_rate': 6.756874120406714e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.82, 'epoch': 0.42}

 42%|█████████████████████████████████████████▏                                                        | 93/221 [11:19<15:21,  7.20s/it]
 43%|█████████████████████████████████████████▋                                                        | 94/221 [11:26<15:15,  7.21s/it]
                                                                                                                                        
{'loss': 0.0272, 'grad_norm': 0.01863975077867508, 'learning_rate': 6.686649936914152e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 455.96, 'epoch': 0.43}

 43%|█████████████████████████████████████████▋                                                        | 94/221 [11:26<15:15,  7.21s/it]
 43%|██████████████████████████████████████████▏                                                       | 95/221 [11:33<15:06,  7.19s/it]
                                                                                                                                        
{'loss': 0.0256, 'grad_norm': 0.019126810133457184, 'learning_rate': 6.616048287272301e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.43}

 43%|██████████████████████████████████████████▏                                                       | 95/221 [11:33<15:06,  7.19s/it]
 43%|██████████████████████████████████████████▌                                                       | 96/221 [11:40<14:59,  7.20s/it]
                                                                                                                                        
{'loss': 0.0293, 'grad_norm': 0.019856387749314308, 'learning_rate': 6.545084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 330.99, 'epoch': 0.43}

 43%|██████████████████████████████████████████▌                                                       | 96/221 [11:40<14:59,  7.20s/it]
 44%|███████████████████████████████████████████                                                       | 97/221 [11:48<14:48,  7.17s/it]
                                                                                                                                        
{'loss': 0.0298, 'grad_norm': 0.020938578993082047, 'learning_rate': 6.473775872054521e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.5, 'epoch': 0.44}

 44%|███████████████████████████████████████████                                                       | 97/221 [11:48<14:48,  7.17s/it]
 44%|███████████████████████████████████████████▍                                                      | 98/221 [11:55<14:41,  7.17s/it]
                                                                                                                                        
{'loss': 0.0213, 'grad_norm': 0.01743321865797043, 'learning_rate': 6.402136946530014e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.25, 'epoch': 0.44}

 44%|███████████████████████████████████████████▍                                                      | 98/221 [11:55<14:41,  7.17s/it]
 45%|███████████████████████████████████████████▉                                                      | 99/221 [12:02<14:37,  7.19s/it]
                                                                                                                                        
{'loss': 0.0289, 'grad_norm': 0.03026910126209259, 'learning_rate': 6.330184227833376e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.35, 'epoch': 0.45}

 45%|███████████████████████████████████████████▉                                                      | 99/221 [12:02<14:37,  7.19s/it]
 45%|███████████████████████████████████████████▉                                                     | 100/221 [12:09<14:22,  7.13s/it]
                                                                                                                                        
{'loss': 0.0255, 'grad_norm': 0.021303489804267883, 'learning_rate': 6.257933818722543e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.68, 'epoch': 0.45}

 45%|███████████████████████████████████████████▉                                                     | 100/221 [12:09<14:22,  7.13s/it]
 46%|████████████████████████████████████████████▎                                                    | 101/221 [12:16<14:14,  7.12s/it]
                                                                                                                                        
{'loss': 0.0302, 'grad_norm': 0.018962478265166283, 'learning_rate': 6.185401888577488e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.8, 'epoch': 0.46}

 46%|████████████████████████████████████████████▎                                                    | 101/221 [12:16<14:14,  7.12s/it]
 46%|████████████████████████████████████████████▊                                                    | 102/221 [12:23<14:09,  7.14s/it]
                                                                                                                                        
{'loss': 0.0233, 'grad_norm': 0.01941424049437046, 'learning_rate': 6.112604669781572e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 468.02, 'epoch': 0.46}

 46%|████████████████████████████████████████████▊                                                    | 102/221 [12:23<14:09,  7.14s/it]
 47%|█████████████████████████████████████████████▏                                                   | 103/221 [12:30<14:02,  7.14s/it]
                                                                                                                                        
{'loss': 0.0295, 'grad_norm': 0.019837241619825363, 'learning_rate': 6.0395584540887963e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.34, 'epoch': 0.47}

 47%|█████████████████████████████████████████████▏                                                   | 103/221 [12:30<14:02,  7.14s/it]
 47%|█████████████████████████████████████████████▋                                                   | 104/221 [12:38<13:54,  7.13s/it]
                                                                                                                                        
{'loss': 0.0259, 'grad_norm': 0.018758604303002357, 'learning_rate': 5.9662795889777666e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 337.38, 'epoch': 0.47}

 47%|█████████████████████████████████████████████▋                                                   | 104/221 [12:38<13:54,  7.13s/it]
 48%|██████████████████████████████████████████████                                                   | 105/221 [12:45<13:43,  7.10s/it]
                                                                                                                                        
{'loss': 0.0274, 'grad_norm': 0.01769891194999218, 'learning_rate': 5.8927844739931834e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.82, 'epoch': 0.48}

 48%|██████████████████████████████████████████████                                                   | 105/221 [12:45<13:43,  7.10s/it]
 48%|██████████████████████████████████████████████▌                                                  | 106/221 [12:52<13:41,  7.14s/it]
                                                                                                                                        
{'loss': 0.0265, 'grad_norm': 0.017575478181242943, 'learning_rate': 5.819089557075689e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 473.38, 'epoch': 0.48}

 48%|██████████████████████████████████████████████▌                                                  | 106/221 [12:52<13:41,  7.14s/it]
 48%|██████████████████████████████████████████████▉                                                  | 107/221 [12:59<13:30,  7.11s/it]
                                                                                                                                        
{'loss': 0.0261, 'grad_norm': 0.017795337364077568, 'learning_rate': 5.745211330880872e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.08, 'epoch': 0.48}

 48%|██████████████████████████████████████████████▉                                                  | 107/221 [12:59<13:30,  7.11s/it]
 49%|███████████████████████████████████████████████▍                                                 | 108/221 [13:06<13:21,  7.09s/it]
                                                                                                                                        
{'loss': 0.0312, 'grad_norm': 0.021093547344207764, 'learning_rate': 5.6711663290882776e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.72, 'epoch': 0.49}

 49%|███████████████████████████████████████████████▍                                                 | 108/221 [13:06<13:21,  7.09s/it]
 49%|███████████████████████████████████████████████▊                                                 | 109/221 [13:13<13:10,  7.06s/it]
                                                                                                                                        
{'loss': 0.0238, 'grad_norm': 0.022809553891420364, 'learning_rate': 5.596971122701221e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 504.0, 'epoch': 0.49}

 49%|███████████████████████████████████████████████▊                                                 | 109/221 [13:13<13:10,  7.06s/it]
 50%|████████████████████████████████████████████████▎                                                | 110/221 [13:20<13:05,  7.08s/it]
                                                                                                                                        
{'loss': 0.0267, 'grad_norm': 0.018646899610757828, 'learning_rate': 5.522642316338268e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.5}

 50%|████████████████████████████████████████████████▎                                                | 110/221 [13:20<13:05,  7.08s/it]
 50%|████████████████████████████████████████████████▋                                                | 111/221 [13:27<13:01,  7.10s/it]
                                                                                                                                        
{'loss': 0.0253, 'grad_norm': 0.0172793660312891, 'learning_rate': 5.448196544517168e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.68, 'epoch': 0.5}

 50%|████████████████████████████████████████████████▋                                                | 111/221 [13:27<13:01,  7.10s/it]
 51%|█████████████████████████████████████████████████▏                                               | 112/221 [13:34<12:57,  7.14s/it]
                                                                                                                                        
{'loss': 0.0335, 'grad_norm': 0.019996505230665207, 'learning_rate': 5.373650467932122e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.18, 'epoch': 0.51}

 51%|█████████████████████████████████████████████████▏                                               | 112/221 [13:34<12:57,  7.14s/it]
 51%|█████████████████████████████████████████████████▌                                               | 113/221 [13:42<12:51,  7.14s/it]
                                                                                                                                        
{'loss': 0.0333, 'grad_norm': 0.017304031178355217, 'learning_rate': 5.299020769725172e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.94, 'epoch': 0.51}

 51%|█████████████████████████████████████████████████▌                                               | 113/221 [13:42<12:51,  7.14s/it]
 52%|██████████████████████████████████████████████████                                               | 114/221 [13:49<12:43,  7.13s/it]
                                                                                                                                        
{'loss': 0.027, 'grad_norm': 0.01827949658036232, 'learning_rate': 5.2243241517525754e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 286.28, 'epoch': 0.52}

 52%|██████████████████████████████████████████████████                                               | 114/221 [13:49<12:43,  7.13s/it]
 52%|██████████████████████████████████████████████████▍                                              | 115/221 [13:56<12:34,  7.12s/it]
                                                                                                                                        
{'loss': 0.0252, 'grad_norm': 0.018098153173923492, 'learning_rate': 5.149577330846993e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.73, 'epoch': 0.52}

 52%|██████████████████████████████████████████████████▍                                              | 115/221 [13:56<12:34,  7.12s/it]
 52%|██████████████████████████████████████████████████▉                                              | 116/221 [14:03<12:28,  7.13s/it]
                                                                                                                                        
{'loss': 0.0229, 'grad_norm': 0.015578909777104855, 'learning_rate': 5.074797035076319e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.63, 'epoch': 0.52}

 52%|██████████████████████████████████████████████████▉                                              | 116/221 [14:03<12:28,  7.13s/it]
 53%|███████████████████████████████████████████████████▎                                             | 117/221 [14:10<12:22,  7.14s/it]
                                                                                                                                        
{'loss': 0.0283, 'grad_norm': 0.01797802932560444, 'learning_rate': 5e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.61, 'epoch': 0.53}

 53%|███████████████████████████████████████████████████▎                                             | 117/221 [14:10<12:22,  7.14s/it]
 53%|███████████████████████████████████████████████████▊                                             | 118/221 [14:17<12:14,  7.13s/it]
                                                                                                                                        
{'loss': 0.0259, 'grad_norm': 0.018971417099237442, 'learning_rate': 4.925202964923683e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.12, 'epoch': 0.53}

 53%|███████████████████████████████████████████████████▊                                             | 118/221 [14:17<12:14,  7.13s/it]
 54%|████████████████████████████████████████████████████▏                                            | 119/221 [14:24<12:09,  7.15s/it]
                                                                                                                                        
{'loss': 0.0265, 'grad_norm': 0.019693924114108086, 'learning_rate': 4.850422669153009e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 549.79, 'epoch': 0.54}

 54%|████████████████████████████████████████████████████▏                                            | 119/221 [14:24<12:09,  7.15s/it]
 54%|████████████████████████████████████████████████████▋                                            | 120/221 [14:31<12:02,  7.15s/it]
                                                                                                                                        
{'loss': 0.0277, 'grad_norm': 0.020947441458702087, 'learning_rate': 4.775675848247427e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.17, 'epoch': 0.54}

 54%|████████████████████████████████████████████████████▋                                            | 120/221 [14:31<12:02,  7.15s/it][2025-11-27 00:37:03,937] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-120

 55%|█████████████████████████████████████████████████████                                            | 121/221 [14:51<17:59, 10.80s/it]
                                                                                                                                        
{'loss': 0.0249, 'grad_norm': 0.01684478297829628, 'learning_rate': 4.700979230274829e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.17, 'epoch': 0.55}

 55%|█████████████████████████████████████████████████████                                            | 121/221 [14:51<17:59, 10.80s/it]
 55%|█████████████████████████████████████████████████████▌                                           | 122/221 [14:58<16:01,  9.72s/it]
                                                                                                                                        
{'loss': 0.0289, 'grad_norm': 0.019412320107221603, 'learning_rate': 4.626349532067879e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 450.57, 'epoch': 0.55}

 55%|█████████████████████████████████████████████████████▌                                           | 122/221 [14:58<16:01,  9.72s/it]
 56%|█████████████████████████████████████████████████████▉                                           | 123/221 [15:05<14:31,  8.89s/it]
                                                                                                                                        
{'loss': 0.0246, 'grad_norm': 0.018697615712881088, 'learning_rate': 4.551803455482833e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 310.05, 'epoch': 0.56}

 56%|█████████████████████████████████████████████████████▉                                           | 123/221 [15:05<14:31,  8.89s/it]
 56%|██████████████████████████████████████████████████████▍                                          | 124/221 [15:12<13:31,  8.36s/it]
                                                                                                                                        
{'loss': 0.0245, 'grad_norm': 0.01853892020881176, 'learning_rate': 4.477357683661734e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 448.58, 'epoch': 0.56}

 56%|██████████████████████████████████████████████████████▍                                          | 124/221 [15:12<13:31,  8.36s/it]
 57%|██████████████████████████████████████████████████████▊                                          | 125/221 [15:19<12:44,  7.96s/it]
                                                                                                                                        
{'loss': 0.0257, 'grad_norm': 0.017834430560469627, 'learning_rate': 4.403028877298779e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 379.27, 'epoch': 0.57}

 57%|██████████████████████████████████████████████████████▊                                          | 125/221 [15:19<12:44,  7.96s/it]
 57%|███████████████████████████████████████████████████████▎                                         | 126/221 [15:26<12:16,  7.75s/it]
                                                                                                                                        
{'loss': 0.027, 'grad_norm': 0.018574297428131104, 'learning_rate': 4.328833670911724e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.42, 'epoch': 0.57}

 57%|███████████████████████████████████████████████████████▎                                         | 126/221 [15:26<12:16,  7.75s/it]
 57%|███████████████████████████████████████████████████████▋                                         | 127/221 [15:34<11:51,  7.57s/it]
                                                                                                                                        
{'loss': 0.0333, 'grad_norm': 0.019024794921278954, 'learning_rate': 4.254788669119127e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 458.35, 'epoch': 0.57}

 57%|███████████████████████████████████████████████████████▋                                         | 127/221 [15:34<11:51,  7.57s/it]
 58%|████████████████████████████████████████████████████████▏                                        | 128/221 [15:41<11:30,  7.43s/it]
                                                                                                                                        
{'loss': 0.0247, 'grad_norm': 0.018406303599476814, 'learning_rate': 4.180910442924312e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.39, 'epoch': 0.58}

 58%|████████████████████████████████████████████████████████▏                                        | 128/221 [15:41<11:30,  7.43s/it]
 58%|████████████████████████████████████████████████████████▌                                        | 129/221 [15:48<11:15,  7.34s/it]
                                                                                                                                        
{'loss': 0.0217, 'grad_norm': 0.016275746747851372, 'learning_rate': 4.107215526006817e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 485.26, 'epoch': 0.58}

 58%|████████████████████████████████████████████████████████▌                                        | 129/221 [15:48<11:15,  7.34s/it]
 59%|█████████████████████████████████████████████████████████                                        | 130/221 [15:55<10:59,  7.25s/it]
                                                                                                                                        
{'loss': 0.0284, 'grad_norm': 0.018617160618305206, 'learning_rate': 4.0337204110222346e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.41, 'epoch': 0.59}

 59%|█████████████████████████████████████████████████████████                                        | 130/221 [15:55<10:59,  7.25s/it]
 59%|█████████████████████████████████████████████████████████▍                                       | 131/221 [16:02<10:48,  7.20s/it]
                                                                                                                                        
{'loss': 0.0216, 'grad_norm': 0.01993851736187935, 'learning_rate': 3.960441545911204e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.14, 'epoch': 0.59}

 59%|█████████████████████████████████████████████████████████▍                                       | 131/221 [16:02<10:48,  7.20s/it]
 60%|█████████████████████████████████████████████████████████▉                                       | 132/221 [16:09<10:38,  7.17s/it]
                                                                                                                                        
{'loss': 0.025, 'grad_norm': 0.017185868695378304, 'learning_rate': 3.887395330218429e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 476.36, 'epoch': 0.6}

 60%|█████████████████████████████████████████████████████████▉                                       | 132/221 [16:09<10:38,  7.17s/it]
 60%|██████████████████████████████████████████████████████████▍                                      | 133/221 [16:16<10:25,  7.11s/it]
                                                                                                                                        
{'loss': 0.0264, 'grad_norm': 0.018661336973309517, 'learning_rate': 3.814598111422513e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.55, 'epoch': 0.6}

 60%|██████████████████████████████████████████████████████████▍                                      | 133/221 [16:16<10:25,  7.11s/it]
 61%|██████████████████████████████████████████████████████████▊                                      | 134/221 [16:23<10:18,  7.10s/it]
                                                                                                                                        
{'loss': 0.0274, 'grad_norm': 0.022303791716694832, 'learning_rate': 3.742066181277458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.2, 'epoch': 0.61}

 61%|██████████████████████████████████████████████████████████▊                                      | 134/221 [16:23<10:18,  7.10s/it]
 61%|███████████████████████████████████████████████████████████▎                                     | 135/221 [16:30<10:15,  7.16s/it]
                                                                                                                                        
{'loss': 0.0217, 'grad_norm': 0.0167496707290411, 'learning_rate': 3.6698157721666246e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.61}

 61%|███████████████████████████████████████████████████████████▎                                     | 135/221 [16:30<10:15,  7.16s/it]
 62%|███████████████████████████████████████████████████████████▋                                     | 136/221 [16:37<10:08,  7.16s/it]
                                                                                                                                        
{'loss': 0.0219, 'grad_norm': 0.016045598313212395, 'learning_rate': 3.597863053469987e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 463.19, 'epoch': 0.62}

 62%|███████████████████████████████████████████████████████████▋                                     | 136/221 [16:37<10:08,  7.16s/it]
 62%|████████████████████████████████████████████████████████████▏                                    | 137/221 [16:45<09:59,  7.13s/it]
                                                                                                                                        
{'loss': 0.027, 'grad_norm': 0.017510127276182175, 'learning_rate': 3.5262241279454785e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.64, 'epoch': 0.62}

 62%|████████████████████████████████████████████████████████████▏                                    | 137/221 [16:45<09:59,  7.13s/it]
 62%|████████████████████████████████████████████████████████████▌                                    | 138/221 [16:52<09:54,  7.16s/it]
                                                                                                                                        
{'loss': 0.0308, 'grad_norm': 0.02389226295053959, 'learning_rate': 3.4549150281252636e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.1, 'epoch': 0.62}

 62%|████████████████████████████████████████████████████████████▌                                    | 138/221 [16:52<09:54,  7.16s/it]
 63%|█████████████████████████████████████████████████████████████                                    | 139/221 [16:59<09:50,  7.20s/it]
                                                                                                                                        
{'loss': 0.0275, 'grad_norm': 0.01793692260980606, 'learning_rate': 3.383951712727701e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.85, 'epoch': 0.63}

 63%|█████████████████████████████████████████████████████████████                                    | 139/221 [16:59<09:50,  7.20s/it]
 63%|█████████████████████████████████████████████████████████████▍                                   | 140/221 [17:06<09:39,  7.16s/it]
                                                                                                                                        
{'loss': 0.0294, 'grad_norm': 0.02539198286831379, 'learning_rate': 3.313350063085851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.2, 'epoch': 0.63}

 63%|█████████████████████████████████████████████████████████████▍                                   | 140/221 [17:06<09:39,  7.16s/it]
 64%|█████████████████████████████████████████████████████████████▉                                   | 141/221 [17:13<09:30,  7.13s/it]
                                                                                                                                        
{'loss': 0.0273, 'grad_norm': 0.019213683903217316, 'learning_rate': 3.243125879593286e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.57, 'epoch': 0.64}

 64%|█████████████████████████████████████████████████████████████▉                                   | 141/221 [17:13<09:30,  7.13s/it]
 64%|██████████████████████████████████████████████████████████████▎                                  | 142/221 [17:20<09:21,  7.11s/it]
                                                                                                                                        
{'loss': 0.0357, 'grad_norm': 0.022743066772818565, 'learning_rate': 3.173294878168025e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.85, 'epoch': 0.64}

 64%|██████████████████████████████████████████████████████████████▎                                  | 142/221 [17:20<09:21,  7.11s/it]
 65%|██████████████████████████████████████████████████████████████▊                                  | 143/221 [17:27<09:17,  7.14s/it]
                                                                                                                                        
{'loss': 0.0258, 'grad_norm': 0.018310556188225746, 'learning_rate': 3.103872686735358e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 307.54, 'epoch': 0.65}

 65%|██████████████████████████████████████████████████████████████▊                                  | 143/221 [17:27<09:17,  7.14s/it]
 65%|███████████████████████████████████████████████████████████████▏                                 | 144/221 [17:35<09:08,  7.12s/it]
                                                                                                                                        
{'loss': 0.0265, 'grad_norm': 0.01981915533542633, 'learning_rate': 3.0348748417303823e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 489.51, 'epoch': 0.65}

 65%|███████████████████████████████████████████████████████████████▏                                 | 144/221 [17:35<09:08,  7.12s/it]
 66%|███████████████████████████████████████████████████████████████▋                                 | 145/221 [17:42<08:58,  7.08s/it]
                                                                                                                                        
{'loss': 0.032, 'grad_norm': 0.01881423592567444, 'learning_rate': 2.9663167846209998e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.97, 'epoch': 0.66}

 66%|███████████████████████████████████████████████████████████████▋                                 | 145/221 [17:42<08:58,  7.08s/it]
 66%|████████████████████████████████████████████████████████████████                                 | 146/221 [17:49<08:52,  7.10s/it]
                                                                                                                                        
{'loss': 0.029, 'grad_norm': 0.017727544531226158, 'learning_rate': 2.8982138584521735e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 475.56, 'epoch': 0.66}

 66%|████████████████████████████████████████████████████████████████                                 | 146/221 [17:49<08:52,  7.10s/it]
 67%|████████████████████████████████████████████████████████████████▌                                | 147/221 [17:56<08:45,  7.10s/it]
                                                                                                                                        
{'loss': 0.0292, 'grad_norm': 0.01951543428003788, 'learning_rate': 2.8305813044122097e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 376.39, 'epoch': 0.67}

 67%|████████████████████████████████████████████████████████████████▌                                | 147/221 [17:56<08:45,  7.10s/it]
 67%|████████████████████████████████████████████████████████████████▉                                | 148/221 [18:03<08:37,  7.09s/it]
                                                                                                                                        
{'loss': 0.0272, 'grad_norm': 0.018047522753477097, 'learning_rate': 2.7634342584218365e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 514.29, 'epoch': 0.67}

 67%|████████████████████████████████████████████████████████████████▉                                | 148/221 [18:03<08:37,  7.09s/it]
 67%|█████████████████████████████████████████████████████████████████▍                               | 149/221 [18:10<08:35,  7.16s/it]
                                                                                                                                        
{'loss': 0.0295, 'grad_norm': 0.020815616473555565, 'learning_rate': 2.6967877477468397e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.84, 'epoch': 0.67}

 67%|█████████████████████████████████████████████████████████████████▍                               | 149/221 [18:10<08:35,  7.16s/it]
 68%|█████████████████████████████████████████████████████████████████▊                               | 150/221 [18:17<08:26,  7.13s/it]
                                                                                                                                        
{'loss': 0.0303, 'grad_norm': 0.02059975638985634, 'learning_rate': 2.630656687635007e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.18, 'epoch': 0.68}

 68%|█████████████████████████████████████████████████████████████████▊                               | 150/221 [18:17<08:26,  7.13s/it]
 68%|██████████████████████████████████████████████████████████████████▎                              | 151/221 [18:24<08:19,  7.13s/it]
                                                                                                                                        
{'loss': 0.0297, 'grad_norm': 0.019998600706458092, 'learning_rate': 2.5650558779781635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.86, 'epoch': 0.68}

 68%|██████████████████████████████████████████████████████████████████▎                              | 151/221 [18:24<08:19,  7.13s/it]
 69%|██████████████████████████████████████████████████████████████████▋                              | 152/221 [18:32<08:15,  7.19s/it]
                                                                                                                                        
{'loss': 0.033, 'grad_norm': 0.022024452686309814, 'learning_rate': 2.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.49, 'epoch': 0.69}

 69%|██████████████████████████████████████████████████████████████████▋                              | 152/221 [18:32<08:15,  7.19s/it]
 69%|███████████████████████████████████████████████████████████████████▏                             | 153/221 [18:39<08:13,  7.26s/it]
                                                                                                                                        
{'loss': 0.0278, 'grad_norm': 0.019334938377141953, 'learning_rate': 2.43550361297047e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.99, 'epoch': 0.69}

 69%|███████████████████████████████████████████████████████████████████▏                             | 153/221 [18:39<08:13,  7.26s/it]
 70%|███████████████████████████████████████████████████████████████████▌                             | 154/221 [18:46<08:04,  7.23s/it]
                                                                                                                                        
{'loss': 0.0294, 'grad_norm': 0.02994287945330143, 'learning_rate': 2.371581150947476e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.31, 'epoch': 0.7}

 70%|███████████████████████████████████████████████████████████████████▌                             | 154/221 [18:46<08:04,  7.23s/it]
 70%|████████████████████████████████████████████████████████████████████                             | 155/221 [18:53<07:54,  7.19s/it]
                                                                                                                                        
{'loss': 0.0228, 'grad_norm': 0.021970966830849648, 'learning_rate': 2.3082469195465893e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.03, 'epoch': 0.7}

 70%|████████████████████████████████████████████████████████████████████                             | 155/221 [18:53<07:54,  7.19s/it]
 71%|████████████████████████████████████████████████████████████████████▍                            | 156/221 [19:00<07:44,  7.15s/it]
                                                                                                                                        
{'loss': 0.0225, 'grad_norm': 0.017728326842188835, 'learning_rate': 2.245515092739488e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 474.23, 'epoch': 0.71}

 71%|████████████████████████████████████████████████████████████████████▍                            | 156/221 [19:00<07:44,  7.15s/it]
 71%|████████████████████████████████████████████████████████████████████▉                            | 157/221 [19:08<07:38,  7.17s/it]
                                                                                                                                        
{'loss': 0.0206, 'grad_norm': 0.016926869750022888, 'learning_rate': 2.1833997096818898e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.82, 'epoch': 0.71}

 71%|████████████████████████████████████████████████████████████████████▉                            | 157/221 [19:08<07:38,  7.17s/it]
 71%|█████████████████████████████████████████████████████████████████████▎                           | 158/221 [19:15<07:29,  7.14s/it]
                                                                                                                                        
{'loss': 0.0281, 'grad_norm': 0.029541337862610817, 'learning_rate': 2.1219146715716332e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.89, 'epoch': 0.71}

 71%|█████████████████████████████████████████████████████████████████████▎                           | 158/221 [19:15<07:29,  7.14s/it]
 72%|█████████████████████████████████████████████████████████████████████▊                           | 159/221 [19:22<07:23,  7.15s/it]
                                                                                                                                        
{'loss': 0.0234, 'grad_norm': 0.01689094677567482, 'learning_rate': 2.061073738537635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 387.99, 'epoch': 0.72}

 72%|█████████████████████████████████████████████████████████████████████▊                           | 159/221 [19:22<07:23,  7.15s/it]
 72%|██████████████████████████████████████████████████████████████████████▏                          | 160/221 [19:29<07:14,  7.13s/it]
                                                                                                                                        
{'loss': 0.033, 'grad_norm': 0.01850043050944805, 'learning_rate': 2.0008905265604316e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.06, 'epoch': 0.72}

 72%|██████████████████████████████████████████████████████████████████████▏                          | 160/221 [19:29<07:14,  7.13s/it]
 73%|██████████████████████████████████████████████████████████████████████▋                          | 161/221 [19:36<07:07,  7.12s/it]
                                                                                                                                        
{'loss': 0.033, 'grad_norm': 0.020465202629566193, 'learning_rate': 1.9413785044249678e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.54, 'epoch': 0.73}

 73%|██████████████████████████████████████████████████████████████████████▋                          | 161/221 [19:36<07:07,  7.12s/it]
 73%|███████████████████████████████████████████████████████████████████████                          | 162/221 [19:43<07:00,  7.12s/it]
                                                                                                                                        
{'loss': 0.0302, 'grad_norm': 0.019559573382139206, 'learning_rate': 1.8825509907063327e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 482.08, 'epoch': 0.73}

 73%|███████████████████████████████████████████████████████████████████████                          | 162/221 [19:43<07:00,  7.12s/it]
 74%|███████████████████████████████████████████████████████████████████████▌                         | 163/221 [19:50<06:51,  7.10s/it]
                                                                                                                                        
{'loss': 0.0202, 'grad_norm': 0.016423381865024567, 'learning_rate': 1.8244211507891063e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 425.58, 'epoch': 0.74}

 74%|███████████████████████████████████████████████████████████████████████▌                         | 163/221 [19:50<06:51,  7.10s/it]
 74%|███████████████████████████████████████████████████████████████████████▉                         | 164/221 [19:57<06:44,  7.10s/it]
                                                                                                                                        
{'loss': 0.0257, 'grad_norm': 0.01980419084429741, 'learning_rate': 1.7670019939210024e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.46, 'epoch': 0.74}

 74%|███████████████████████████████████████████████████████████████████████▉                         | 164/221 [19:57<06:44,  7.10s/it]
 75%|████████████████████████████████████████████████████████████████████████▍                        | 165/221 [20:05<06:39,  7.13s/it]
                                                                                                                                        
{'loss': 0.025, 'grad_norm': 0.021348468959331512, 'learning_rate': 1.7103063703014372e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 498.15, 'epoch': 0.75}

 75%|████████████████████████████████████████████████████████████████████████▍                        | 165/221 [20:05<06:39,  7.13s/it]
 75%|████████████████████████████████████████████████████████████████████████▊                        | 166/221 [20:12<06:29,  7.09s/it]
                                                                                                                                        
{'loss': 0.026, 'grad_norm': 0.01638958230614662, 'learning_rate': 1.6543469682057106e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.25, 'epoch': 0.75}

 75%|████████████████████████████████████████████████████████████████████████▊                        | 166/221 [20:12<06:29,  7.09s/it]
 76%|█████████████████████████████████████████████████████████████████████████▎                       | 167/221 [20:19<06:23,  7.10s/it]
                                                                                                                                        
{'loss': 0.0306, 'grad_norm': 0.02299441583454609, 'learning_rate': 1.599136311145402e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.47, 'epoch': 0.76}

 76%|█████████████████████████████████████████████████████████████████████████▎                       | 167/221 [20:19<06:23,  7.10s/it]
 76%|█████████████████████████████████████████████████████████████████████████▋                       | 168/221 [20:26<06:15,  7.09s/it]
                                                                                                                                        
{'loss': 0.025, 'grad_norm': 0.017598189413547516, 'learning_rate': 1.544686755065677e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.71, 'epoch': 0.76}

 76%|█████████████████████████████████████████████████████████████████████████▋                       | 168/221 [20:26<06:15,  7.09s/it]
 76%|██████████████████████████████████████████████████████████████████████████▏                      | 169/221 [20:33<06:10,  7.13s/it]
                                                                                                                                        
{'loss': 0.0236, 'grad_norm': 0.01685059629380703, 'learning_rate': 1.4910104855800427e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.43, 'epoch': 0.76}

 76%|██████████████████████████████████████████████████████████████████████████▏                      | 169/221 [20:33<06:10,  7.13s/it]
 77%|██████████████████████████████████████████████████████████████████████████▌                      | 170/221 [20:40<06:03,  7.13s/it]
                                                                                                                                        
{'loss': 0.0272, 'grad_norm': 0.018304958939552307, 'learning_rate': 1.438119515243277e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 451.61, 'epoch': 0.77}

 77%|██████████████████████████████████████████████████████████████████████████▌                      | 170/221 [20:40<06:03,  7.13s/it]
 77%|███████████████████████████████████████████████████████████████████████████                      | 171/221 [20:47<05:56,  7.12s/it]
                                                                                                                                        
{'loss': 0.022, 'grad_norm': 0.018485043197870255, 'learning_rate': 1.3860256808630428e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.29, 'epoch': 0.77}

 77%|███████████████████████████████████████████████████████████████████████████                      | 171/221 [20:47<05:56,  7.12s/it]
 78%|███████████████████████████████████████████████████████████████████████████▍                     | 172/221 [20:54<05:50,  7.15s/it]
                                                                                                                                        
{'loss': 0.0244, 'grad_norm': 0.018180640414357185, 'learning_rate': 1.3347406408508695e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 401.39, 'epoch': 0.78}

 78%|███████████████████████████████████████████████████████████████████████████▍                     | 172/221 [20:54<05:50,  7.15s/it]
 78%|███████████████████████████████████████████████████████████████████████████▉                     | 173/221 [21:01<05:42,  7.13s/it]
                                                                                                                                        
{'loss': 0.0284, 'grad_norm': 0.018684981390833855, 'learning_rate': 1.2842758726130283e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.88, 'epoch': 0.78}

 78%|███████████████████████████████████████████████████████████████████████████▉                     | 173/221 [21:01<05:42,  7.13s/it]
 79%|████████████████████████████████████████████████████████████████████████████▎                    | 174/221 [21:09<05:36,  7.16s/it]
                                                                                                                                        
{'loss': 0.0289, 'grad_norm': 0.021512368693947792, 'learning_rate': 1.2346426699819458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 383.13, 'epoch': 0.79}

 79%|████████████████████████████████████████████████████████████████████████████▎                    | 174/221 [21:09<05:36,  7.16s/it]
 79%|████████████████████████████████████████████████████████████████████████████▊                    | 175/221 [21:16<05:29,  7.15s/it]
                                                                                                                                        
{'loss': 0.0279, 'grad_norm': 0.023360926657915115, 'learning_rate': 1.1858521406886675e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.55, 'epoch': 0.79}

 79%|████████████████████████████████████████████████████████████████████████████▊                    | 175/221 [21:16<05:29,  7.15s/it]
 80%|█████████████████████████████████████████████████████████████████████████████▏                   | 176/221 [21:23<05:21,  7.14s/it]
                                                                                                                                        
{'loss': 0.0279, 'grad_norm': 0.022415969520807266, 'learning_rate': 1.137915203877003e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.44, 'epoch': 0.8}

 80%|█████████████████████████████████████████████████████████████████████████████▏                   | 176/221 [21:23<05:21,  7.14s/it]
 80%|█████████████████████████████████████████████████████████████████████████████▋                   | 177/221 [21:30<05:14,  7.16s/it]
                                                                                                                                        
{'loss': 0.0256, 'grad_norm': 0.01842794381082058, 'learning_rate': 1.090842587659851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.33, 'epoch': 0.8}

 80%|█████████████████████████████████████████████████████████████████████████████▋                   | 177/221 [21:30<05:14,  7.16s/it]
 81%|██████████████████████████████████████████████████████████████████████████████▏                  | 178/221 [21:37<05:07,  7.15s/it]
                                                                                                                                        
{'loss': 0.0282, 'grad_norm': 0.021043118089437485, 'learning_rate': 1.0446448267182952e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 398.08, 'epoch': 0.81}

 81%|██████████████████████████████████████████████████████████████████████████████▏                  | 178/221 [21:37<05:07,  7.15s/it]
 81%|██████████████████████████████████████████████████████████████████████████████▌                  | 179/221 [21:44<05:01,  7.17s/it]
                                                                                                                                        
{'loss': 0.0266, 'grad_norm': 0.020777752622961998, 'learning_rate': 9.993322599439692e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.9, 'epoch': 0.81}

 81%|██████████████████████████████████████████████████████████████████████████████▌                  | 179/221 [21:44<05:01,  7.17s/it]
 81%|███████████████████████████████████████████████████████████████████████████████                  | 180/221 [21:52<04:52,  7.14s/it]
                                                                                                                                        
{'loss': 0.0365, 'grad_norm': 0.026739781722426414, 'learning_rate': 9.549150281252633e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.35, 'epoch': 0.81}

 81%|███████████████████████████████████████████████████████████████████████████████                  | 180/221 [21:52<04:52,  7.14s/it][2025-11-27 00:44:23,881] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-180

 82%|███████████████████████████████████████████████████████████████████████████████▍                 | 181/221 [22:11<07:11, 10.79s/it]
                                                                                                                                        
{'loss': 0.0358, 'grad_norm': 0.01902214251458645, 'learning_rate': 9.114030716778433e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.08, 'epoch': 0.82}

 82%|███████████████████████████████████████████████████████████████████████████████▍                 | 181/221 [22:11<07:11, 10.79s/it]
 82%|███████████████████████████████████████████████████████████████████████████████▉                 | 182/221 [22:18<06:19,  9.74s/it]
                                                                                                                                        
{'loss': 0.0249, 'grad_norm': 0.019700270146131516, 'learning_rate': 8.688061284200266e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.96, 'epoch': 0.82}

 82%|███████████████████████████████████████████████████████████████████████████████▉                 | 182/221 [22:18<06:19,  9.74s/it]
 83%|████████████████████████████████████████████████████████████████████████████████▎                | 183/221 [22:25<05:40,  8.95s/it]
                                                                                                                                        
{'loss': 0.0259, 'grad_norm': 0.021064477041363716, 'learning_rate': 8.271337313934869e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.67, 'epoch': 0.83}

 83%|████████████████████████████████████████████████████████████████████████████████▎                | 183/221 [22:25<05:40,  8.95s/it]
 83%|████████████████████████████████████████████████████████████████████████████████▊                | 184/221 [22:33<05:13,  8.47s/it]
                                                                                                                                        
{'loss': 0.0291, 'grad_norm': 0.01918896846473217, 'learning_rate': 7.863952067298042e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 285.99, 'epoch': 0.83}

 83%|████████████████████████████████████████████████████████████████████████████████▊                | 184/221 [22:33<05:13,  8.47s/it]
 84%|█████████████████████████████████████████████████████████████████████████████████▏               | 185/221 [22:40<04:50,  8.08s/it]
                                                                                                                                        
{'loss': 0.0298, 'grad_norm': 0.020457495003938675, 'learning_rate': 7.465996715633028e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 322.48, 'epoch': 0.84}

 84%|█████████████████████████████████████████████████████████████████████████████████▏               | 185/221 [22:40<04:50,  8.08s/it]
 84%|█████████████████████████████████████████████████████████████████████████████████▋               | 186/221 [22:47<04:32,  7.77s/it]
                                                                                                                                        
{'loss': 0.0276, 'grad_norm': 0.019534621387720108, 'learning_rate': 7.077560319906695e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 527.31, 'epoch': 0.84}

 84%|█████████████████████████████████████████████████████████████████████████████████▋               | 186/221 [22:47<04:32,  7.77s/it]
 85%|██████████████████████████████████████████████████████████████████████████████████               | 187/221 [22:54<04:16,  7.55s/it]
                                                                                                                                        
{'loss': 0.0284, 'grad_norm': 0.019534330815076828, 'learning_rate': 6.698729810778065e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.99, 'epoch': 0.85}

 85%|██████████████████████████████████████████████████████████████████████████████████               | 187/221 [22:54<04:16,  7.55s/it]
 85%|██████████████████████████████████████████████████████████████████████████████████▌              | 188/221 [23:01<04:04,  7.42s/it]
                                                                                                                                        
{'loss': 0.0306, 'grad_norm': 0.032107334583997726, 'learning_rate': 6.329589969143518e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.76, 'epoch': 0.85}

 85%|██████████████████████████████████████████████████████████████████████████████████▌              | 188/221 [23:01<04:04,  7.42s/it]
 86%|██████████████████████████████████████████████████████████████████████████████████▉              | 189/221 [23:08<03:56,  7.38s/it]
                                                                                                                                        
{'loss': 0.0412, 'grad_norm': 0.019100667908787727, 'learning_rate': 5.9702234071631e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.21, 'epoch': 0.86}

 86%|██████████████████████████████████████████████████████████████████████████████████▉              | 189/221 [23:08<03:56,  7.38s/it]
 86%|███████████████████████████████████████████████████████████████████████████████████▍             | 190/221 [23:15<03:46,  7.31s/it]
                                                                                                                                        
{'loss': 0.027, 'grad_norm': 0.01997012086212635, 'learning_rate': 5.620710549772295e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 335.96, 'epoch': 0.86}

 86%|███████████████████████████████████████████████████████████████████████████████████▍             | 190/221 [23:15<03:46,  7.31s/it]
 86%|███████████████████████████████████████████████████████████████████████████████████▊             | 191/221 [23:22<03:37,  7.25s/it]
                                                                                                                                        
{'loss': 0.0248, 'grad_norm': 0.01957276090979576, 'learning_rate': 5.281129616683167e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 355.72, 'epoch': 0.86}

 86%|███████████████████████████████████████████████████████████████████████████████████▊             | 191/221 [23:23<03:37,  7.25s/it]
 87%|████████████████████████████████████████████████████████████████████████████████████▎            | 192/221 [23:30<03:29,  7.22s/it]
                                                                                                                                        
{'loss': 0.0266, 'grad_norm': 0.019423488527536392, 'learning_rate': 4.951556604879048e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.6, 'epoch': 0.87}

 87%|████████████████████████████████████████████████████████████████████████████████████▎            | 192/221 [23:30<03:29,  7.22s/it]
 87%|████████████████████████████████████████████████████████████████████████████████████▋            | 193/221 [23:37<03:22,  7.25s/it]
                                                                                                                                        
{'loss': 0.0246, 'grad_norm': 0.021110599860548973, 'learning_rate': 4.632065271606756e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.45, 'epoch': 0.87}

 87%|████████████████████████████████████████████████████████████████████████████████████▋            | 193/221 [23:37<03:22,  7.25s/it]
 88%|█████████████████████████████████████████████████████████████████████████████████████▏           | 194/221 [23:44<03:14,  7.21s/it]
                                                                                                                                        
{'loss': 0.026, 'grad_norm': 0.02191292867064476, 'learning_rate': 4.322727117869951e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 404.69, 'epoch': 0.88}

 88%|█████████████████████████████████████████████████████████████████████████████████████▏           | 194/221 [23:44<03:14,  7.21s/it]
 88%|█████████████████████████████████████████████████████████████████████████████████████▌           | 195/221 [23:51<03:05,  7.15s/it]
                                                                                                                                        
{'loss': 0.0277, 'grad_norm': 0.018202103674411774, 'learning_rate': 4.023611372427471e-06, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 501.69, 'epoch': 0.88}

 88%|█████████████████████████████████████████████████████████████████████████████████████▌           | 195/221 [23:51<03:05,  7.15s/it]
 89%|██████████████████████████████████████████████████████████████████████████████████████           | 196/221 [23:58<02:58,  7.13s/it]
                                                                                                                                        
{'loss': 0.0203, 'grad_norm': 0.016622671857476234, 'learning_rate': 3.734784976300165e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.02, 'epoch': 0.89}

 89%|██████████████████████████████████████████████████████████████████████████████████████           | 196/221 [23:58<02:58,  7.13s/it]
 89%|██████████████████████████████████████████████████████████████████████████████████████▍          | 197/221 [24:05<02:51,  7.16s/it]
                                                                                                                                        
{'loss': 0.0281, 'grad_norm': 0.019572410732507706, 'learning_rate': 3.4563125677897932e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.87, 'epoch': 0.89}

 89%|██████████████████████████████████████████████████████████████████████████████████████▍          | 197/221 [24:05<02:51,  7.16s/it]
 90%|██████████████████████████████████████████████████████████████████████████████████████▉          | 198/221 [24:13<02:44,  7.17s/it]
                                                                                                                                        
{'loss': 0.0272, 'grad_norm': 0.02384166046977043, 'learning_rate': 3.18825646801314e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.89, 'epoch': 0.9}

 90%|██████████████████████████████████████████████████████████████████████████████████████▉          | 198/221 [24:13<02:44,  7.17s/it]
 90%|███████████████████████████████████████████████████████████████████████████████████████▎         | 199/221 [24:20<02:37,  7.14s/it]
                                                                                                                                        
{'loss': 0.0291, 'grad_norm': 0.022356677800416946, 'learning_rate': 2.930676666954846e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.48, 'epoch': 0.9}

 90%|███████████████████████████████████████████████████████████████████████████████████████▎         | 199/221 [24:20<02:37,  7.14s/it]
 90%|███████████████████████████████████████████████████████████████████████████████████████▊         | 200/221 [24:27<02:30,  7.15s/it]
                                                                                                                                        
{'loss': 0.0354, 'grad_norm': 0.024130800738930702, 'learning_rate': 2.6836308100417873e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.26, 'epoch': 0.9}

 90%|███████████████████████████████████████████████████████████████████████████████████████▊         | 200/221 [24:27<02:30,  7.15s/it]
 91%|████████████████████████████████████████████████████████████████████████████████████████▏        | 201/221 [24:34<02:23,  7.17s/it]
                                                                                                                                        
{'loss': 0.0267, 'grad_norm': 0.021222814917564392, 'learning_rate': 2.4471741852423237e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 494.07, 'epoch': 0.91}

 91%|████████████████████████████████████████████████████████████████████████████████████████▏        | 201/221 [24:34<02:23,  7.17s/it]
 91%|████████████████████████████████████████████████████████████████████████████████████████▋        | 202/221 [24:41<02:15,  7.16s/it]
                                                                                                                                        
{'loss': 0.0307, 'grad_norm': 0.01970573328435421, 'learning_rate': 2.221359710692961e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.36, 'epoch': 0.91}

 91%|████████████████████████████████████████████████████████████████████████████████████████▋        | 202/221 [24:41<02:15,  7.16s/it]
 92%|█████████████████████████████████████████████████████████████████████████████████████████        | 203/221 [24:48<02:08,  7.13s/it]
                                                                                                                                        
{'loss': 0.0234, 'grad_norm': 0.01854623667895794, 'learning_rate': 2.006237922855553e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 394.22, 'epoch': 0.92}

 92%|█████████████████████████████████████████████████████████████████████████████████████████        | 203/221 [24:48<02:08,  7.13s/it]
 92%|█████████████████████████████████████████████████████████████████████████████████████████▌       | 204/221 [24:55<02:01,  7.15s/it]
                                                                                                                                        
{'loss': 0.0241, 'grad_norm': 0.0231945738196373, 'learning_rate': 1.8018569652073381e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.08, 'epoch': 0.92}

 92%|█████████████████████████████████████████████████████████████████████████████████████████▌       | 204/221 [24:55<02:01,  7.15s/it]
 93%|█████████████████████████████████████████████████████████████████████████████████████████▉       | 205/221 [25:03<01:54,  7.13s/it]
                                                                                                                                        
{'loss': 0.0264, 'grad_norm': 0.018703971058130264, 'learning_rate': 1.6082625774666794e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 471.75, 'epoch': 0.93}

 93%|█████████████████████████████████████████████████████████████████████████████████████████▉       | 205/221 [25:03<01:54,  7.13s/it]
 93%|██████████████████████████████████████████████████████████████████████████████████████████▍      | 206/221 [25:10<01:47,  7.14s/it]
                                                                                                                                        
{'loss': 0.0251, 'grad_norm': 0.017928369343280792, 'learning_rate': 1.4254980853566247e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.2, 'epoch': 0.93}

 93%|██████████████████████████████████████████████████████████████████████████████████████████▍      | 206/221 [25:10<01:47,  7.14s/it]
 94%|██████████████████████████████████████████████████████████████████████████████████████████▊      | 207/221 [25:17<01:39,  7.13s/it]
                                                                                                                                        
{'loss': 0.0239, 'grad_norm': 0.018282128497958183, 'learning_rate': 1.2536043909088191e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.9, 'epoch': 0.94}

 94%|██████████████████████████████████████████████████████████████████████████████████████████▊      | 207/221 [25:17<01:39,  7.13s/it]
 94%|███████████████████████████████████████████████████████████████████████████████████████████▎     | 208/221 [25:24<01:32,  7.12s/it]
                                                                                                                                        
{'loss': 0.0307, 'grad_norm': 0.021411525085568428, 'learning_rate': 1.0926199633097157e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 382.68, 'epoch': 0.94}

 94%|███████████████████████████████████████████████████████████████████████████████████████████▎     | 208/221 [25:24<01:32,  7.12s/it]
 95%|███████████████████████████████████████████████████████████████████████████████████████████▋     | 209/221 [25:31<01:25,  7.17s/it]
                                                                                                                                        
{'loss': 0.0287, 'grad_norm': 0.02065850794315338, 'learning_rate': 9.42580830291373e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 431.06, 'epoch': 0.95}

 95%|███████████████████████████████████████████████████████████████████████████████████████████▋     | 209/221 [25:31<01:25,  7.17s/it]
 95%|████████████████████████████████████████████████████████████████████████████████████████████▏    | 210/221 [25:39<01:19,  7.21s/it]
                                                                                                                                        
{'loss': 0.0212, 'grad_norm': 0.019915733486413956, 'learning_rate': 8.035205700685167e-07, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 418.82, 'epoch': 0.95}

 95%|████████████████████████████████████████████████████████████████████████████████████████████▏    | 210/221 [25:39<01:19,  7.21s/it]
 95%|████████████████████████████████████████████████████████████████████████████████████████████▌    | 211/221 [25:46<01:11,  7.18s/it]
                                                                                                                                        
{'loss': 0.0264, 'grad_norm': 0.020451124757528305, 'learning_rate': 6.75470303823933e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.8, 'epoch': 0.95}

 95%|████████████████████████████████████████████████████████████████████████████████████████████▌    | 211/221 [25:46<01:11,  7.18s/it]
 96%|█████████████████████████████████████████████████████████████████████████████████████████████    | 212/221 [25:53<01:04,  7.17s/it]
                                                                                                                                        
{'loss': 0.0268, 'grad_norm': 0.02076980657875538, 'learning_rate': 5.584586887435739e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.6, 'epoch': 0.96}

 96%|█████████████████████████████████████████████████████████████████████████████████████████████    | 212/221 [25:53<01:04,  7.17s/it]
 96%|█████████████████████████████████████████████████████████████████████████████████████████████▍   | 213/221 [26:00<00:57,  7.14s/it]
                                                                                                                                        
{'loss': 0.0246, 'grad_norm': 0.019138546660542488, 'learning_rate': 4.52511911603265e-07, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 364.67, 'epoch': 0.96}

 96%|█████████████████████████████████████████████████████████████████████████████████████████████▍   | 213/221 [26:00<00:57,  7.14s/it]
 97%|█████████████████████████████████████████████████████████████████████████████████████████████▉   | 214/221 [26:07<00:49,  7.11s/it]
                                                                                                                                        
{'loss': 0.0348, 'grad_norm': 0.026033613830804825, 'learning_rate': 3.576536829081323e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 505.19, 'epoch': 0.97}

 97%|█████████████████████████████████████████████████████████████████████████████████████████████▉   | 214/221 [26:07<00:49,  7.11s/it]
 97%|██████████████████████████████████████████████████████████████████████████████████████████████▎  | 215/221 [26:14<00:42,  7.11s/it]
                                                                                                                                        
{'loss': 0.0308, 'grad_norm': 0.01909700781106949, 'learning_rate': 2.7390523158633554e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 402.37, 'epoch': 0.97}

 97%|██████████████████████████████████████████████████████████████████████████████████████████████▎  | 215/221 [26:14<00:42,  7.11s/it]
 98%|██████████████████████████████████████████████████████████████████████████████████████████████▊  | 216/221 [26:21<00:35,  7.17s/it]
                                                                                                                                        
{'loss': 0.0251, 'grad_norm': 0.017447378486394882, 'learning_rate': 2.012853002380466e-07, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.02, 'epoch': 0.98}

 98%|██████████████████████████████████████████████████████████████████████████████████████████████▊  | 216/221 [26:21<00:35,  7.17s/it]
 98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28,  7.16s/it]
                                                                                                                                        
{'loss': 0.0294, 'grad_norm': 0.0193310659378767, 'learning_rate': 1.3981014094099353e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 392.76, 'epoch': 0.98}

 98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28,  7.16s/it]
 99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21,  7.17s/it]
                                                                                                                                        
{'loss': 0.0286, 'grad_norm': 0.023237884044647217, 'learning_rate': 8.949351161324227e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 444.9, 'epoch': 0.99}

 99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21,  7.17s/it]
 99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14,  7.15s/it]
                                                                                                                                        
{'loss': 0.0264, 'grad_norm': 0.017193371430039406, 'learning_rate': 5.0346672934270534e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.84, 'epoch': 0.99}

 99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14,  7.15s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07,  7.13s/it]
                                                                                                                                        
{'loss': 0.0311, 'grad_norm': 0.01859343983232975, 'learning_rate': 2.237838582483387e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 512.81, 'epoch': 1.0}

100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07,  7.13s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00,  7.17s/it]
                                                                                                                                        
{'loss': 0.0256, 'grad_norm': 0.021252349019050598, 'learning_rate': 5.594909486328348e-09, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.96, 'epoch': 1.0}

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00,  7.17s/it][2025-11-27 00:49:29,268] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-221

                                                                                                                                        
{'train_runtime': 1631.9607, 'train_samples_per_second': 17.334, 'train_steps_per_second': 0.135, 'train_loss': 0.030637798653873383, 'memory/max_active (GiB)': 15.75, 'memory/max_allocated (GiB)': 15.75, 'memory/device_reserved (GiB)': 50.97, 'epoch': 1.0}

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00,  7.17s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00,  7.37s/it]
[2025-11-27 00:49:33,160] [INFO] [axolotl.train.save_trained_model:218] [PID:80269] Training completed! Saving trained model to ./nov262025-sc-LoRA-Run.
[2025-11-27 00:49:33,820] [INFO] [axolotl.train.save_trained_model:336] [PID:80269] Model successfully saved to ./nov262025-sc-LoRA-Run