File size: 208,368 Bytes
9604973 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 |
[2025-11-27 00:21:02,496] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:80269] baseline 0.000GB ()
[2025-11-27 00:21:02,496] [INFO] [axolotl.cli.config.load_cfg:248] [PID:80269] config:
{
"activation_offloading": false,
"adapter": "lora",
"axolotl_config_path": "seedcoder.yaml",
"base_model": "ByteDance-Seed/Seed-Coder-8B-Instruct",
"base_model_config": "ByteDance-Seed/Seed-Coder-8B-Instruct",
"batch_size": 128,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": false,
"n_gpu": 8,
"n_node": 1
},
"context_parallel_size": 1,
"dataloader_num_workers": 8,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 208,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "new_data_clean.jsonl",
"roles": {
"assistant": [
"assistant"
],
"system": [
"system"
],
"user": [
"user"
]
},
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": true,
"deepspeed": {
"bf16": {
"enabled": "auto"
},
"fp16": {
"auto_cast": false,
"enabled": "auto",
"hysteresis": 2,
"initial_scale_power": 32,
"loss_scale": 0,
"loss_scale_window": 1000,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"zero_optimization": {
"contiguous_gradients": true,
"offload_optimizer": {
"device": "cpu"
},
"overlap_comm": true,
"stage": 2
}
},
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.8.0"
},
"eval_batch_size": 16,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"flash_attention": true,
"fp16": false,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"group_by_length": false,
"include_tkps": true,
"is_falcon_derived_model": false,
"is_llama_derived_model": true,
"is_mistral_derived_model": false,
"learning_rate": 0.0001,
"liger_fused_linear_cross_entropy": true,
"liger_glu_activation": true,
"liger_layer_norm": true,
"liger_rms_norm": true,
"liger_rope": true,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_r": 64,
"lora_target_linear": true,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"micro_batch_size": 16,
"model_config_type": "llama",
"num_epochs": 1.0,
"optimizer": "adamw_torch",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./nov262025-sc-LoRA-Run",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.liger.LigerPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": false,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 60,
"save_total_limit": 100,
"sequence_len": 4096,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": false,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "ByteDance-Seed/Seed-Coder-8B-Instruct",
"tokenizer_save_jinja_files": true,
"tokenizer_type": "AutoTokenizer",
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"trust_remote_code": true,
"type_of_model": "AutoModelForCausalLM",
"use_otel_metrics": false,
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"wandb_entity": "test-aa",
"wandb_name": "nov-26-sc-lor-run-1",
"wandb_project": "seedcoder",
"warmup_ratio": 0.05,
"weight_decay": 0.0,
"world_size": 8
}
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]>
[2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None
[2025-11-27 00:21:41,317] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:80269] Loading prepared dataset from disk at last_run_prepared/683f1b6addffef1a6c101561a46fc077...
Loading dataset from disk: 0%| | 0/110 [00:00<?, ?it/s]
Loading dataset from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 82772.41it/s]
[2025-11-27 00:21:41,594] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:80269] total_num_tokens: 75_959_959
[2025-11-27 00:21:42,244] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:80269] `total_supervised_tokens: 5_309_191`
[2025-11-27 00:21:42,245] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:80269] total_num_steps: 221
[2025-11-27 00:21:42,245] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:80269] Maximum number of steps set at 221
[2025-11-27 00:21:42,270] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:80269] Loading tokenizer... ByteDance-Seed/Seed-Coder-8B-Instruct
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]>
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None
[2025-11-27 00:21:42,832] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:80269] Loading model
[2025-11-27 00:21:42,935] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:80269] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-11-27 00:21:42,937] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:80269] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-11-27 00:21:42,955] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:80269] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards: 25%|████████████████████████████████ | 1/4 [00:01<00:03, 1.31s/it]
Loading checkpoint shards: 50%|████████████████████████████████████████████████████████████████ | 2/4 [00:02<00:02, 1.27s/it]
Loading checkpoint shards: 75%|████████████████████████████████████████████████████████████████████████████████████████████████ | 3/4 [00:03<00:01, 1.28s/it]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.07it/s]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.06s/it]
[2025-11-27 00:22:10,720] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:80269] Converting modules to torch.bfloat16
[2025-11-27 00:22:10,723] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] Memory usage after model load 18.938GB (+18.938GB allocated, +20.139GB reserved)
[2025-11-27 00:22:10,724] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:80269] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 167,772,160 || all params: 8,418,234,368 || trainable%: 1.9930
[2025-11-27 00:22:12,070] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] after adapters 16.002GB (+16.002GB allocated, +20.436GB reserved)
[2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:398] [PID:80269] Pre-saving adapter config to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:402] [PID:80269] Pre-saving tokenizer to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,712] [INFO] [axolotl.train.save_initial_configs:407] [PID:80269] Pre-saving model config to ./nov262025-sc-LoRA-Run...
[2025-11-27 00:22:13,716] [INFO] [axolotl.train.execute_training:196] [PID:80269] Starting trainer...
Time to load cpu_adam op: 2.386819839477539 seconds
[34m[1mwandb[0m: Currently logged in as: [33mpandyamarut[0m ([33mtest-aa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K
[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[Am[2K
[34m[1mwandb[0m: Tracking run with wandb version 0.22.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/osmosis/wandb/run-20251127_002220-5un64tuw[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mnov-26-sc-lor-run-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/test-aa/seedcoder[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/test-aa/seedcoder/runs/5un64tuw[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-11-27 00:22:21,580] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:80269] The Axolotl config has been saved to the WandB run under files.
[2025-11-27 00:22:22,118] [INFO] [axolotl.utils.callbacks.on_train_begin:820] [PID:80269] The DeepSpeed config has been saved to the WandB run under files.
0%| | 0/221 [00:00<?, ?it/s]
0%|▋ | 1/221 [00:10<37:28, 10.22s/it]
{'loss': 0.0756, 'grad_norm': 0.10699598491191864, 'learning_rate': 0.0, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.81, 'tokens_per_second_per_gpu': 336.12, 'epoch': 0.0}
0%|▋ | 1/221 [00:10<37:28, 10.22s/it]
1%|█▍ | 2/221 [00:17<31:29, 8.63s/it]
{'loss': 0.069, 'grad_norm': 0.10141075402498245, 'learning_rate': 9.090909090909091e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 499.67, 'epoch': 0.01}
1%|█▍ | 2/221 [00:17<31:29, 8.63s/it]
1%|██ | 3/221 [00:24<28:45, 7.92s/it]
{'loss': 0.0881, 'grad_norm': 0.11926735192537308, 'learning_rate': 1.8181818181818182e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 511.33, 'epoch': 0.01}
1%|██ | 3/221 [00:24<28:45, 7.92s/it]
2%|██▊ | 4/221 [00:31<27:22, 7.57s/it]
{'loss': 0.0793, 'grad_norm': 0.11467798799276352, 'learning_rate': 2.7272727272727273e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.69, 'epoch': 0.02}
2%|██▊ | 4/221 [00:31<27:22, 7.57s/it]
2%|███▍ | 5/221 [00:38<26:33, 7.38s/it]
{'loss': 0.0894, 'grad_norm': 0.10370815545320511, 'learning_rate': 3.6363636363636364e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 327.47, 'epoch': 0.02}
2%|███▍ | 5/221 [00:38<26:33, 7.38s/it]
3%|████▏ | 6/221 [00:45<26:02, 7.27s/it]
{'loss': 0.0653, 'grad_norm': 0.0778045579791069, 'learning_rate': 4.545454545454546e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 438.39, 'epoch': 0.03}
3%|████▏ | 6/221 [00:45<26:02, 7.27s/it]
3%|████▊ | 7/221 [00:53<25:48, 7.23s/it]
{'loss': 0.0604, 'grad_norm': 0.05470091104507446, 'learning_rate': 5.4545454545454546e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 454.21, 'epoch': 0.03}
3%|████▊ | 7/221 [00:53<25:48, 7.23s/it]
4%|█████▌ | 8/221 [01:00<25:36, 7.21s/it]
{'loss': 0.0575, 'grad_norm': 0.04792458191514015, 'learning_rate': 6.363636363636364e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 381.85, 'epoch': 0.04}
4%|█████▌ | 8/221 [01:00<25:36, 7.21s/it]
4%|██████▏ | 9/221 [01:07<25:21, 7.18s/it]
{'loss': 0.057, 'grad_norm': 0.04809016361832619, 'learning_rate': 7.272727272727273e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 395.18, 'epoch': 0.04}
4%|██████▏ | 9/221 [01:07<25:21, 7.18s/it]
5%|██████▉ | 10/221 [01:14<25:06, 7.14s/it]
{'loss': 0.0472, 'grad_norm': 0.05050504207611084, 'learning_rate': 8.181818181818183e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 437.16, 'epoch': 0.05}
5%|██████▉ | 10/221 [01:14<25:06, 7.14s/it]
5%|███████▌ | 11/221 [01:21<25:01, 7.15s/it]
{'loss': 0.0422, 'grad_norm': 0.057043518871068954, 'learning_rate': 9.090909090909092e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 386.23, 'epoch': 0.05}
5%|███████▌ | 11/221 [01:21<25:01, 7.15s/it]
5%|████████▎ | 12/221 [01:28<25:00, 7.18s/it]
{'loss': 0.033, 'grad_norm': 0.04036800563335419, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 309.64, 'epoch': 0.05}
5%|████████▎ | 12/221 [01:28<25:00, 7.18s/it]
6%|████████▉ | 13/221 [01:35<24:46, 7.15s/it]
{'loss': 0.0429, 'grad_norm': 0.03150289133191109, 'learning_rate': 9.999440509051368e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 452.59, 'epoch': 0.06}
6%|████████▉ | 13/221 [01:35<24:46, 7.15s/it]
6%|█████████▋ | 14/221 [01:43<24:39, 7.15s/it]
{'loss': 0.0381, 'grad_norm': 0.03723820298910141, 'learning_rate': 9.997762161417517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 365.29, 'epoch': 0.06}
6%|█████████▋ | 14/221 [01:43<24:39, 7.15s/it]
7%|██████▋ | 15/221 [01:50<24:29, 7.13s/it]
{'loss': 0.0445, 'grad_norm': 0.026561176404356956, 'learning_rate': 9.994965332706573e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 383.32, 'epoch': 0.07}
7%|██████▋ | 15/221 [01:50<24:29, 7.13s/it]
7%|███████ | 16/221 [01:57<24:21, 7.13s/it]
{'loss': 0.0347, 'grad_norm': 0.022102832794189453, 'learning_rate': 9.991050648838675e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 571.37, 'epoch': 0.07}
7%|███████ | 16/221 [01:57<24:21, 7.13s/it]
8%|███████▌ | 17/221 [02:04<24:15, 7.14s/it]
{'loss': 0.0332, 'grad_norm': 0.026612414047122, 'learning_rate': 9.986018985905901e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 371.09, 'epoch': 0.08}
8%|███████▌ | 17/221 [02:04<24:15, 7.14s/it]
8%|███████▉ | 18/221 [02:11<24:04, 7.11s/it]
{'loss': 0.0384, 'grad_norm': 0.02820519544184208, 'learning_rate': 9.979871469976196e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 359.8, 'epoch': 0.08}
8%|███████▉ | 18/221 [02:11<24:04, 7.11s/it]
9%|████████▍ | 19/221 [02:18<23:55, 7.11s/it]
{'loss': 0.0306, 'grad_norm': 0.06290236860513687, 'learning_rate': 9.972609476841367e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 255.21, 'epoch': 0.09}
9%|████████▍ | 19/221 [02:18<23:55, 7.11s/it]
9%|████████▊ | 20/221 [02:25<23:49, 7.11s/it]
{'loss': 0.0328, 'grad_norm': 0.02102799527347088, 'learning_rate': 9.964234631709187e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 388.05, 'epoch': 0.09}
9%|████████▊ | 20/221 [02:25<23:49, 7.11s/it]
10%|█████████▎ | 21/221 [02:32<23:46, 7.13s/it]
{'loss': 0.0336, 'grad_norm': 0.022319750860333443, 'learning_rate': 9.954748808839674e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 496.89, 'epoch': 0.1}
10%|█████████▎ | 21/221 [02:32<23:46, 7.13s/it]
10%|█████████▊ | 22/221 [02:40<23:41, 7.14s/it]
{'loss': 0.0349, 'grad_norm': 0.019568774849176407, 'learning_rate': 9.944154131125642e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 417.97, 'epoch': 0.1}
10%|█████████▊ | 22/221 [02:40<23:41, 7.14s/it]
10%|██████████▏ | 23/221 [02:47<23:30, 7.12s/it]
{'loss': 0.0382, 'grad_norm': 0.04317627474665642, 'learning_rate': 9.932452969617607e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 377.6, 'epoch': 0.1}
10%|██████████▏ | 23/221 [02:47<23:30, 7.12s/it]
11%|██████████▋ | 24/221 [02:54<23:32, 7.17s/it]
{'loss': 0.0361, 'grad_norm': 0.027220861986279488, 'learning_rate': 9.919647942993148e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 468.41, 'epoch': 0.11}
11%|██████████▋ | 24/221 [02:54<23:32, 7.17s/it]
11%|███████████ | 25/221 [03:01<23:27, 7.18s/it]
{'loss': 0.031, 'grad_norm': 0.019090518355369568, 'learning_rate': 9.905741916970864e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.96, 'epoch': 0.11}
11%|███████████ | 25/221 [03:01<23:27, 7.18s/it]
12%|███████████▌ | 26/221 [03:09<23:34, 7.26s/it]
{'loss': 0.0316, 'grad_norm': 0.019753405824303627, 'learning_rate': 9.890738003669029e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 385.75, 'epoch': 0.12}
12%|███████████▌ | 26/221 [03:09<23:34, 7.26s/it]
12%|███████████▉ | 27/221 [03:16<23:29, 7.26s/it]
{'loss': 0.0402, 'grad_norm': 0.021183036267757416, 'learning_rate': 9.874639560909117e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 481.45, 'epoch': 0.12}
12%|███████████▉ | 27/221 [03:16<23:29, 7.26s/it]
13%|████████████▍ | 28/221 [03:23<23:16, 7.23s/it]
{'loss': 0.0312, 'grad_norm': 0.018204571679234505, 'learning_rate': 9.857450191464337e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 317.6, 'epoch': 0.13}
13%|████████████▍ | 28/221 [03:23<23:16, 7.23s/it]
13%|████████████▊ | 29/221 [03:30<23:06, 7.22s/it]
{'loss': 0.0343, 'grad_norm': 0.02151501551270485, 'learning_rate': 9.839173742253334e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 471.66, 'epoch': 0.13}
13%|████████████▊ | 29/221 [03:30<23:06, 7.22s/it]
14%|█████████████▎ | 30/221 [03:37<22:51, 7.18s/it]
{'loss': 0.0337, 'grad_norm': 0.021778756752610207, 'learning_rate': 9.819814303479267e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 391.67, 'epoch': 0.14}
14%|█████████████▎ | 30/221 [03:37<22:51, 7.18s/it]
14%|█████████████▋ | 31/221 [03:45<22:53, 7.23s/it]
{'loss': 0.0264, 'grad_norm': 0.01405468862503767, 'learning_rate': 9.799376207714445e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 441.4, 'epoch': 0.14}
14%|█████████████▋ | 31/221 [03:45<22:53, 7.23s/it]
14%|██████████████▏ | 32/221 [03:52<22:39, 7.19s/it]
{'loss': 0.0325, 'grad_norm': 0.0183633491396904, 'learning_rate': 9.777864028930705e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 449.02, 'epoch': 0.14}
14%|██████████████▏ | 32/221 [03:52<22:39, 7.19s/it]
15%|██████████████▋ | 33/221 [03:59<22:35, 7.21s/it]
{'loss': 0.0342, 'grad_norm': 0.022185783833265305, 'learning_rate': 9.755282581475769e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 356.6, 'epoch': 0.15}
15%|██████████████▋ | 33/221 [03:59<22:35, 7.21s/it]
15%|███████████████ | 34/221 [04:06<22:21, 7.17s/it]
{'loss': 0.0255, 'grad_norm': 0.015691177919507027, 'learning_rate': 9.731636918995821e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 461.92, 'epoch': 0.15}
15%|███████████████ | 34/221 [04:06<22:21, 7.17s/it]
16%|███████████████▌ | 35/221 [04:13<22:14, 7.17s/it]
{'loss': 0.0314, 'grad_norm': 0.01962122693657875, 'learning_rate': 9.706932333304517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 428.5, 'epoch': 0.16}
16%|███████████████▌ | 35/221 [04:13<22:14, 7.17s/it]
16%|███████████████▉ | 36/221 [04:20<22:05, 7.17s/it]
{'loss': 0.0396, 'grad_norm': 0.01783391274511814, 'learning_rate': 9.681174353198687e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 393.65, 'epoch': 0.16}
16%|███████████████▉ | 36/221 [04:20<22:05, 7.17s/it]
17%|████████████████▍ | 37/221 [04:28<21:57, 7.16s/it]
{'loss': 0.0351, 'grad_norm': 0.020520439371466637, 'learning_rate': 9.654368743221022e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 331.18, 'epoch': 0.17}
17%|████████████████▍ | 37/221 [04:28<21:57, 7.16s/it]
17%|████████████████▊ | 38/221 [04:35<21:47, 7.14s/it]
{'loss': 0.0341, 'grad_norm': 0.019169267266988754, 'learning_rate': 9.626521502369984e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 429.02, 'epoch': 0.17}
17%|████████████████▊ | 38/221 [04:35<21:47, 7.14s/it]
18%|█████████████████▎ | 39/221 [04:42<21:41, 7.15s/it]
{'loss': 0.0351, 'grad_norm': 0.02138075977563858, 'learning_rate': 9.597638862757255e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 491.69, 'epoch': 0.18}
18%|█████████████████▎ | 39/221 [04:42<21:41, 7.15s/it]
18%|█████████████████▋ | 40/221 [04:49<21:37, 7.17s/it]
{'loss': 0.0338, 'grad_norm': 0.0176653191447258, 'learning_rate': 9.567727288213005e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.26, 'epoch': 0.18}
18%|█████████████████▋ | 40/221 [04:49<21:37, 7.17s/it]
19%|██████████████████▏ | 41/221 [04:56<21:29, 7.17s/it]
{'loss': 0.0316, 'grad_norm': 0.017243385314941406, 'learning_rate': 9.536793472839325e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.46, 'epoch': 0.19}
19%|██████████████████▏ | 41/221 [04:56<21:29, 7.17s/it]
19%|██████████████████▌ | 42/221 [05:03<21:18, 7.14s/it]
{'loss': 0.0216, 'grad_norm': 0.0146207669749856, 'learning_rate': 9.504844339512095e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 363.65, 'epoch': 0.19}
19%|██████████████████▌ | 42/221 [05:03<21:18, 7.14s/it]
19%|███████████████████ | 43/221 [05:10<21:11, 7.14s/it]
{'loss': 0.0335, 'grad_norm': 0.017516395077109337, 'learning_rate': 9.471887038331685e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.46, 'epoch': 0.19}
19%|███████████████████ | 43/221 [05:10<21:11, 7.14s/it]
20%|███████████████████▌ | 44/221 [05:18<21:05, 7.15s/it]
{'loss': 0.0313, 'grad_norm': 0.01834929920732975, 'learning_rate': 9.437928945022771e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.41, 'epoch': 0.2}
20%|███████████████████▌ | 44/221 [05:18<21:05, 7.15s/it]
20%|███████████████████▉ | 45/221 [05:25<20:58, 7.15s/it]
{'loss': 0.0312, 'grad_norm': 0.018410420045256615, 'learning_rate': 9.40297765928369e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 414.21, 'epoch': 0.2}
20%|███████████████████▉ | 45/221 [05:25<20:58, 7.15s/it]
21%|████████████████████▍ | 46/221 [05:32<20:43, 7.11s/it]
{'loss': 0.0335, 'grad_norm': 0.043539997190237045, 'learning_rate': 9.367041003085649e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 319.79, 'epoch': 0.21}
21%|████████████████████▍ | 46/221 [05:32<20:43, 7.11s/it]
21%|████████████████████▊ | 47/221 [05:39<20:40, 7.13s/it]
{'loss': 0.0307, 'grad_norm': 0.019783005118370056, 'learning_rate': 9.330127018922194e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 262.29, 'epoch': 0.21}
21%|████████████████████▊ | 47/221 [05:39<20:40, 7.13s/it]
22%|█████████████████████▎ | 48/221 [05:46<20:31, 7.12s/it]
{'loss': 0.0308, 'grad_norm': 0.018382525071501732, 'learning_rate': 9.292243968009331e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.22}
22%|█████████████████████▎ | 48/221 [05:46<20:31, 7.12s/it]
22%|█████████████████████▋ | 49/221 [05:53<20:25, 7.13s/it]
{'loss': 0.0338, 'grad_norm': 0.021564122289419174, 'learning_rate': 9.253400328436699e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 484.09, 'epoch': 0.22}
22%|█████████████████████▋ | 49/221 [05:53<20:25, 7.13s/it]
23%|██████████████████████▏ | 50/221 [06:00<20:16, 7.12s/it]
{'loss': 0.0285, 'grad_norm': 0.016710789874196053, 'learning_rate': 9.213604793270196e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.23}
23%|██████████████████████▏ | 50/221 [06:00<20:16, 7.12s/it]
23%|██████████████████████▌ | 51/221 [06:07<20:10, 7.12s/it]
{'loss': 0.0293, 'grad_norm': 0.016314025968313217, 'learning_rate': 9.172866268606513e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.09, 'epoch': 0.23}
23%|██████████████████████▌ | 51/221 [06:07<20:10, 7.12s/it]
24%|███████████████████████ | 52/221 [06:14<20:02, 7.12s/it]
{'loss': 0.0273, 'grad_norm': 0.01764376275241375, 'learning_rate': 9.131193871579975e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 558.97, 'epoch': 0.24}
24%|███████████████████████ | 52/221 [06:14<20:02, 7.12s/it]
24%|███████████████████████▌ | 53/221 [06:21<19:49, 7.08s/it]
{'loss': 0.0245, 'grad_norm': 0.01896459050476551, 'learning_rate': 9.088596928322158e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.06, 'epoch': 0.24}
24%|███████████████████████▌ | 53/221 [06:21<19:49, 7.08s/it]
24%|███████████████████████▉ | 54/221 [06:29<19:48, 7.12s/it]
{'loss': 0.0301, 'grad_norm': 0.01687958650290966, 'learning_rate': 9.045084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 296.5, 'epoch': 0.24}
24%|███████████████████████▉ | 54/221 [06:29<19:48, 7.12s/it]
25%|████████████████████████▍ | 55/221 [06:36<19:41, 7.12s/it]
{'loss': 0.0341, 'grad_norm': 0.021479196846485138, 'learning_rate': 9.000667740056032e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.19, 'epoch': 0.25}
25%|████████████████████████▍ | 55/221 [06:36<19:41, 7.12s/it]
25%|████████████████████████▊ | 56/221 [06:43<19:39, 7.15s/it]
{'loss': 0.0271, 'grad_norm': 0.016711527481675148, 'learning_rate': 8.955355173281708e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.2, 'epoch': 0.25}
25%|████████████████████████▊ | 56/221 [06:43<19:39, 7.15s/it]
26%|█████████████████████████▎ | 57/221 [06:50<19:37, 7.18s/it]
{'loss': 0.0287, 'grad_norm': 0.01733219437301159, 'learning_rate': 8.90915741234015e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.43, 'epoch': 0.26}
26%|█████████████████████████▎ | 57/221 [06:50<19:37, 7.18s/it]
26%|█████████████████████████▋ | 58/221 [06:57<19:27, 7.16s/it]
{'loss': 0.0254, 'grad_norm': 0.01601138710975647, 'learning_rate': 8.862084796122998e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.26}
26%|█████████████████████████▋ | 58/221 [06:57<19:27, 7.16s/it]
27%|██████████████████████████▏ | 59/221 [07:04<19:16, 7.14s/it]
{'loss': 0.0311, 'grad_norm': 0.018599703907966614, 'learning_rate': 8.814147859311332e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.27}
27%|██████████████████████████▏ | 59/221 [07:04<19:16, 7.14s/it]
27%|██████████████████████████▌ | 60/221 [07:12<19:09, 7.14s/it]
{'loss': 0.0309, 'grad_norm': 0.01615062914788723, 'learning_rate': 8.765357330018056e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.15, 'epoch': 0.27}
27%|██████████████████████████▌ | 60/221 [07:12<19:09, 7.14s/it][2025-11-27 00:29:44,068] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-60
28%|███████████████████████████ | 61/221 [07:31<28:50, 10.82s/it]
{'loss': 0.0245, 'grad_norm': 0.01592225581407547, 'learning_rate': 8.715724127386972e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.76, 'epoch': 0.28}
28%|███████████████████████████ | 61/221 [07:31<28:50, 10.82s/it]
28%|███████████████████████████▍ | 62/221 [07:38<25:36, 9.66s/it]
{'loss': 0.0295, 'grad_norm': 0.02008405141532421, 'learning_rate': 8.665259359149132e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 447.91, 'epoch': 0.28}
28%|███████████████████████████▍ | 62/221 [07:38<25:36, 9.66s/it]
29%|███████████████████████████▉ | 63/221 [07:45<23:25, 8.90s/it]
{'loss': 0.0366, 'grad_norm': 0.019492069259285927, 'learning_rate': 8.613974319136958e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 405.86, 'epoch': 0.29}
29%|███████████████████████████▉ | 63/221 [07:45<23:25, 8.90s/it]
29%|████████████████████████████▍ | 64/221 [07:52<21:51, 8.36s/it]
{'loss': 0.0284, 'grad_norm': 0.02178225666284561, 'learning_rate': 8.561880484756725e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.25, 'epoch': 0.29}
29%|████████████████████████████▍ | 64/221 [07:52<21:51, 8.36s/it]
29%|████████████████████████████▊ | 65/221 [07:59<20:46, 7.99s/it]
{'loss': 0.0297, 'grad_norm': 0.019038653001189232, 'learning_rate': 8.508989514419958e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 345.76, 'epoch': 0.29}
29%|████████████████████████████▊ | 65/221 [07:59<20:46, 7.99s/it]
30%|█████████████████████████████▎ | 66/221 [08:07<20:04, 7.77s/it]
{'loss': 0.0293, 'grad_norm': 0.01684654876589775, 'learning_rate': 8.455313244934324e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.88, 'epoch': 0.3}
30%|█████████████████████████████▎ | 66/221 [08:07<20:04, 7.77s/it]
30%|█████████████████████████████▋ | 67/221 [08:14<19:25, 7.57s/it]
{'loss': 0.0239, 'grad_norm': 0.01636500470340252, 'learning_rate': 8.400863688854597e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.77, 'epoch': 0.3}
30%|█████████████████████████████▋ | 67/221 [08:14<19:25, 7.57s/it]
31%|██████████████████████████████▏ | 68/221 [08:21<18:56, 7.43s/it]
{'loss': 0.0263, 'grad_norm': 0.020848819985985756, 'learning_rate': 8.345653031794292e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 365.36, 'epoch': 0.31}
31%|██████████████████████████████▏ | 68/221 [08:21<18:56, 7.43s/it]
31%|██████████████████████████████▌ | 69/221 [08:28<18:33, 7.33s/it]
{'loss': 0.0355, 'grad_norm': 0.02269025892019272, 'learning_rate': 8.289693629698564e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.76, 'epoch': 0.31}
31%|██████████████████████████████▌ | 69/221 [08:28<18:33, 7.33s/it]
32%|███████████████████████████████ | 70/221 [08:35<18:16, 7.26s/it]
{'loss': 0.0284, 'grad_norm': 0.01883563958108425, 'learning_rate': 8.232998006078997e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.65, 'epoch': 0.32}
32%|███████████████████████████████ | 70/221 [08:35<18:16, 7.26s/it]
32%|███████████████████████████████▍ | 71/221 [08:42<17:58, 7.19s/it]
{'loss': 0.0261, 'grad_norm': 0.017690833657979965, 'learning_rate': 8.175578849210895e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 441.77, 'epoch': 0.32}
32%|███████████████████████████████▍ | 71/221 [08:42<17:58, 7.19s/it]
33%|███████████████████████████████▉ | 72/221 [08:49<17:49, 7.17s/it]
{'loss': 0.0282, 'grad_norm': 0.018213583156466484, 'learning_rate': 8.117449009293668e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.18, 'epoch': 0.33}
33%|███████████████████████████████▉ | 72/221 [08:49<17:49, 7.17s/it]
33%|████████████████████████████████▎ | 73/221 [08:56<17:36, 7.14s/it]
{'loss': 0.0309, 'grad_norm': 0.019336581230163574, 'learning_rate': 8.058621495575032e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.23, 'epoch': 0.33}
33%|████████████████████████████████▎ | 73/221 [08:56<17:36, 7.14s/it]
33%|████████████████████████████████▊ | 74/221 [09:03<17:25, 7.11s/it]
{'loss': 0.0312, 'grad_norm': 0.019243910908699036, 'learning_rate': 7.999109473439569e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.43, 'epoch': 0.33}
33%|████████████████████████████████▊ | 74/221 [09:03<17:25, 7.11s/it]
34%|█████████████████████████████████▎ | 75/221 [09:10<17:17, 7.11s/it]
{'loss': 0.0317, 'grad_norm': 0.01973150111734867, 'learning_rate': 7.938926261462366e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.63, 'epoch': 0.34}
34%|█████████████████████████████████▎ | 75/221 [09:10<17:17, 7.11s/it]
34%|█████████████████████████████████▋ | 76/221 [09:18<17:15, 7.14s/it]
{'loss': 0.0293, 'grad_norm': 0.02182990498840809, 'learning_rate': 7.878085328428369e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.11, 'epoch': 0.34}
34%|█████████████████████████████████▋ | 76/221 [09:18<17:15, 7.14s/it]
35%|██████████████████████████████████▏ | 77/221 [09:25<17:06, 7.13s/it]
{'loss': 0.0287, 'grad_norm': 0.018669869750738144, 'learning_rate': 7.81660029031811e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 408.44, 'epoch': 0.35}
35%|██████████████████████████████████▏ | 77/221 [09:25<17:06, 7.13s/it]
35%|██████████████████████████████████▌ | 78/221 [09:32<16:59, 7.13s/it]
{'loss': 0.0315, 'grad_norm': 0.016969047486782074, 'learning_rate': 7.754484907260513e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.64, 'epoch': 0.35}
35%|██████████████████████████████████▌ | 78/221 [09:32<16:59, 7.13s/it]
36%|███████████████████████████████████ | 79/221 [09:39<16:48, 7.10s/it]
{'loss': 0.0278, 'grad_norm': 0.019713636487722397, 'learning_rate': 7.691753080453412e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 397.76, 'epoch': 0.36}
36%|███████████████████████████████████ | 79/221 [09:39<16:48, 7.10s/it]
36%|███████████████████████████████████▍ | 80/221 [09:46<16:45, 7.13s/it]
{'loss': 0.0255, 'grad_norm': 0.017600620165467262, 'learning_rate': 7.628418849052523e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.33, 'epoch': 0.36}
36%|███████████████████████████████████▍ | 80/221 [09:46<16:45, 7.13s/it]
37%|███████████████████████████████████▉ | 81/221 [09:53<16:36, 7.12s/it]
{'loss': 0.0234, 'grad_norm': 0.018615400418639183, 'learning_rate': 7.564496387029532e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 352.61, 'epoch': 0.37}
37%|███████████████████████████████████▉ | 81/221 [09:53<16:36, 7.12s/it]
37%|████████████████████████████████████▎ | 82/221 [10:00<16:27, 7.11s/it]
{'loss': 0.0234, 'grad_norm': 0.023312179371714592, 'learning_rate': 7.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 499.51, 'epoch': 0.37}
37%|████████████████████████████████████▎ | 82/221 [10:00<16:27, 7.11s/it]
38%|████████████████████████████████████▊ | 83/221 [10:07<16:23, 7.13s/it]
{'loss': 0.0321, 'grad_norm': 0.01922520436346531, 'learning_rate': 7.434944122021836e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 419.08, 'epoch': 0.38}
38%|████████████████████████████████████▊ | 83/221 [10:07<16:23, 7.13s/it]
38%|█████████████████████████████████████▏ | 84/221 [10:14<16:11, 7.09s/it]
{'loss': 0.0254, 'grad_norm': 0.019153179600834846, 'learning_rate': 7.369343312364993e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.78, 'epoch': 0.38}
38%|█████████████████████████████████████▏ | 84/221 [10:14<16:11, 7.09s/it]
38%|█████████████████████████████████████▋ | 85/221 [10:22<16:12, 7.15s/it]
{'loss': 0.0343, 'grad_norm': 0.020285822451114655, 'learning_rate': 7.303212252253162e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 371.99, 'epoch': 0.38}
38%|█████████████████████████████████████▋ | 85/221 [10:22<16:12, 7.15s/it]
39%|██████████████████████████████████████▏ | 86/221 [10:29<15:59, 7.11s/it]
{'loss': 0.0249, 'grad_norm': 0.016675548627972603, 'learning_rate': 7.236565741578163e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.7, 'epoch': 0.39}
39%|██████████████████████████████████████▏ | 86/221 [10:29<15:59, 7.11s/it]
39%|██████████████████████████████████████▌ | 87/221 [10:36<15:59, 7.16s/it]
{'loss': 0.0289, 'grad_norm': 0.015159820206463337, 'learning_rate': 7.169418695587791e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 344.64, 'epoch': 0.39}
39%|██████████████████████████████████████▌ | 87/221 [10:36<15:59, 7.16s/it]
40%|███████████████████████████████████████ | 88/221 [10:43<15:52, 7.16s/it]
{'loss': 0.0276, 'grad_norm': 0.018055099993944168, 'learning_rate': 7.101786141547828e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 375.32, 'epoch': 0.4}
40%|███████████████████████████████████████ | 88/221 [10:43<15:52, 7.16s/it]
40%|███████████████████████████████████████▍ | 89/221 [10:50<15:42, 7.14s/it]
{'loss': 0.0277, 'grad_norm': 0.01955697126686573, 'learning_rate': 7.033683215379002e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.83, 'epoch': 0.4}
40%|███████████████████████████████████████▍ | 89/221 [10:50<15:42, 7.14s/it]
41%|███████████████████████████████████████▉ | 90/221 [10:57<15:36, 7.15s/it]
{'loss': 0.0277, 'grad_norm': 0.01860162802040577, 'learning_rate': 6.965125158269619e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.84, 'epoch': 0.41}
41%|███████████████████████████████████████▉ | 90/221 [10:57<15:36, 7.15s/it]
41%|████████████████████████████████████████▎ | 91/221 [11:04<15:25, 7.12s/it]
{'loss': 0.0322, 'grad_norm': 0.02057529240846634, 'learning_rate': 6.896127313264643e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.85, 'epoch': 0.41}
41%|████████████████████████████████████████▎ | 91/221 [11:04<15:25, 7.12s/it]
42%|████████████████████████████████████████▊ | 92/221 [11:11<15:15, 7.09s/it]
{'loss': 0.0228, 'grad_norm': 0.017251698300242424, 'learning_rate': 6.826705121831976e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 483.38, 'epoch': 0.42}
42%|████████████████████████████████████████▊ | 92/221 [11:11<15:15, 7.09s/it]
42%|█████████████████████████████████████████▏ | 93/221 [11:19<15:21, 7.20s/it]
{'loss': 0.028, 'grad_norm': 0.017092842608690262, 'learning_rate': 6.756874120406714e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.82, 'epoch': 0.42}
42%|█████████████████████████████████████████▏ | 93/221 [11:19<15:21, 7.20s/it]
43%|█████████████████████████████████████████▋ | 94/221 [11:26<15:15, 7.21s/it]
{'loss': 0.0272, 'grad_norm': 0.01863975077867508, 'learning_rate': 6.686649936914152e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 455.96, 'epoch': 0.43}
43%|█████████████████████████████████████████▋ | 94/221 [11:26<15:15, 7.21s/it]
43%|██████████████████████████████████████████▏ | 95/221 [11:33<15:06, 7.19s/it]
{'loss': 0.0256, 'grad_norm': 0.019126810133457184, 'learning_rate': 6.616048287272301e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.43}
43%|██████████████████████████████████████████▏ | 95/221 [11:33<15:06, 7.19s/it]
43%|██████████████████████████████████████████▌ | 96/221 [11:40<14:59, 7.20s/it]
{'loss': 0.0293, 'grad_norm': 0.019856387749314308, 'learning_rate': 6.545084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 330.99, 'epoch': 0.43}
43%|██████████████████████████████████████████▌ | 96/221 [11:40<14:59, 7.20s/it]
44%|███████████████████████████████████████████ | 97/221 [11:48<14:48, 7.17s/it]
{'loss': 0.0298, 'grad_norm': 0.020938578993082047, 'learning_rate': 6.473775872054521e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.5, 'epoch': 0.44}
44%|███████████████████████████████████████████ | 97/221 [11:48<14:48, 7.17s/it]
44%|███████████████████████████████████████████▍ | 98/221 [11:55<14:41, 7.17s/it]
{'loss': 0.0213, 'grad_norm': 0.01743321865797043, 'learning_rate': 6.402136946530014e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.25, 'epoch': 0.44}
44%|███████████████████████████████████████████▍ | 98/221 [11:55<14:41, 7.17s/it]
45%|███████████████████████████████████████████▉ | 99/221 [12:02<14:37, 7.19s/it]
{'loss': 0.0289, 'grad_norm': 0.03026910126209259, 'learning_rate': 6.330184227833376e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.35, 'epoch': 0.45}
45%|███████████████████████████████████████████▉ | 99/221 [12:02<14:37, 7.19s/it]
45%|███████████████████████████████████████████▉ | 100/221 [12:09<14:22, 7.13s/it]
{'loss': 0.0255, 'grad_norm': 0.021303489804267883, 'learning_rate': 6.257933818722543e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.68, 'epoch': 0.45}
45%|███████████████████████████████████████████▉ | 100/221 [12:09<14:22, 7.13s/it]
46%|████████████████████████████████████████████▎ | 101/221 [12:16<14:14, 7.12s/it]
{'loss': 0.0302, 'grad_norm': 0.018962478265166283, 'learning_rate': 6.185401888577488e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.8, 'epoch': 0.46}
46%|████████████████████████████████████████████▎ | 101/221 [12:16<14:14, 7.12s/it]
46%|████████████████████████████████████████████▊ | 102/221 [12:23<14:09, 7.14s/it]
{'loss': 0.0233, 'grad_norm': 0.01941424049437046, 'learning_rate': 6.112604669781572e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 468.02, 'epoch': 0.46}
46%|████████████████████████████████████████████▊ | 102/221 [12:23<14:09, 7.14s/it]
47%|█████████████████████████████████████████████▏ | 103/221 [12:30<14:02, 7.14s/it]
{'loss': 0.0295, 'grad_norm': 0.019837241619825363, 'learning_rate': 6.0395584540887963e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.34, 'epoch': 0.47}
47%|█████████████████████████████████████████████▏ | 103/221 [12:30<14:02, 7.14s/it]
47%|█████████████████████████████████████████████▋ | 104/221 [12:38<13:54, 7.13s/it]
{'loss': 0.0259, 'grad_norm': 0.018758604303002357, 'learning_rate': 5.9662795889777666e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 337.38, 'epoch': 0.47}
47%|█████████████████████████████████████████████▋ | 104/221 [12:38<13:54, 7.13s/it]
48%|██████████████████████████████████████████████ | 105/221 [12:45<13:43, 7.10s/it]
{'loss': 0.0274, 'grad_norm': 0.01769891194999218, 'learning_rate': 5.8927844739931834e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.82, 'epoch': 0.48}
48%|██████████████████████████████████████████████ | 105/221 [12:45<13:43, 7.10s/it]
48%|██████████████████████████████████████████████▌ | 106/221 [12:52<13:41, 7.14s/it]
{'loss': 0.0265, 'grad_norm': 0.017575478181242943, 'learning_rate': 5.819089557075689e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 473.38, 'epoch': 0.48}
48%|██████████████████████████████████████████████▌ | 106/221 [12:52<13:41, 7.14s/it]
48%|██████████████████████████████████████████████▉ | 107/221 [12:59<13:30, 7.11s/it]
{'loss': 0.0261, 'grad_norm': 0.017795337364077568, 'learning_rate': 5.745211330880872e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.08, 'epoch': 0.48}
48%|██████████████████████████████████████████████▉ | 107/221 [12:59<13:30, 7.11s/it]
49%|███████████████████████████████████████████████▍ | 108/221 [13:06<13:21, 7.09s/it]
{'loss': 0.0312, 'grad_norm': 0.021093547344207764, 'learning_rate': 5.6711663290882776e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.72, 'epoch': 0.49}
49%|███████████████████████████████████████████████▍ | 108/221 [13:06<13:21, 7.09s/it]
49%|███████████████████████████████████████████████▊ | 109/221 [13:13<13:10, 7.06s/it]
{'loss': 0.0238, 'grad_norm': 0.022809553891420364, 'learning_rate': 5.596971122701221e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 504.0, 'epoch': 0.49}
49%|███████████████████████████████████████████████▊ | 109/221 [13:13<13:10, 7.06s/it]
50%|████████████████████████████████████████████████▎ | 110/221 [13:20<13:05, 7.08s/it]
{'loss': 0.0267, 'grad_norm': 0.018646899610757828, 'learning_rate': 5.522642316338268e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.5}
50%|████████████████████████████████████████████████▎ | 110/221 [13:20<13:05, 7.08s/it]
50%|████████████████████████████████████████████████▋ | 111/221 [13:27<13:01, 7.10s/it]
{'loss': 0.0253, 'grad_norm': 0.0172793660312891, 'learning_rate': 5.448196544517168e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.68, 'epoch': 0.5}
50%|████████████████████████████████████████████████▋ | 111/221 [13:27<13:01, 7.10s/it]
51%|█████████████████████████████████████████████████▏ | 112/221 [13:34<12:57, 7.14s/it]
{'loss': 0.0335, 'grad_norm': 0.019996505230665207, 'learning_rate': 5.373650467932122e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.18, 'epoch': 0.51}
51%|█████████████████████████████████████████████████▏ | 112/221 [13:34<12:57, 7.14s/it]
51%|█████████████████████████████████████████████████▌ | 113/221 [13:42<12:51, 7.14s/it]
{'loss': 0.0333, 'grad_norm': 0.017304031178355217, 'learning_rate': 5.299020769725172e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.94, 'epoch': 0.51}
51%|█████████████████████████████████████████████████▌ | 113/221 [13:42<12:51, 7.14s/it]
52%|██████████████████████████████████████████████████ | 114/221 [13:49<12:43, 7.13s/it]
{'loss': 0.027, 'grad_norm': 0.01827949658036232, 'learning_rate': 5.2243241517525754e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 286.28, 'epoch': 0.52}
52%|██████████████████████████████████████████████████ | 114/221 [13:49<12:43, 7.13s/it]
52%|██████████████████████████████████████████████████▍ | 115/221 [13:56<12:34, 7.12s/it]
{'loss': 0.0252, 'grad_norm': 0.018098153173923492, 'learning_rate': 5.149577330846993e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.73, 'epoch': 0.52}
52%|██████████████████████████████████████████████████▍ | 115/221 [13:56<12:34, 7.12s/it]
52%|██████████████████████████████████████████████████▉ | 116/221 [14:03<12:28, 7.13s/it]
{'loss': 0.0229, 'grad_norm': 0.015578909777104855, 'learning_rate': 5.074797035076319e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.63, 'epoch': 0.52}
52%|██████████████████████████████████████████████████▉ | 116/221 [14:03<12:28, 7.13s/it]
53%|███████████████████████████████████████████████████▎ | 117/221 [14:10<12:22, 7.14s/it]
{'loss': 0.0283, 'grad_norm': 0.01797802932560444, 'learning_rate': 5e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.61, 'epoch': 0.53}
53%|███████████████████████████████████████████████████▎ | 117/221 [14:10<12:22, 7.14s/it]
53%|███████████████████████████████████████████████████▊ | 118/221 [14:17<12:14, 7.13s/it]
{'loss': 0.0259, 'grad_norm': 0.018971417099237442, 'learning_rate': 4.925202964923683e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.12, 'epoch': 0.53}
53%|███████████████████████████████████████████████████▊ | 118/221 [14:17<12:14, 7.13s/it]
54%|████████████████████████████████████████████████████▏ | 119/221 [14:24<12:09, 7.15s/it]
{'loss': 0.0265, 'grad_norm': 0.019693924114108086, 'learning_rate': 4.850422669153009e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 549.79, 'epoch': 0.54}
54%|████████████████████████████████████████████████████▏ | 119/221 [14:24<12:09, 7.15s/it]
54%|████████████████████████████████████████████████████▋ | 120/221 [14:31<12:02, 7.15s/it]
{'loss': 0.0277, 'grad_norm': 0.020947441458702087, 'learning_rate': 4.775675848247427e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.17, 'epoch': 0.54}
54%|████████████████████████████████████████████████████▋ | 120/221 [14:31<12:02, 7.15s/it][2025-11-27 00:37:03,937] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-120
55%|█████████████████████████████████████████████████████ | 121/221 [14:51<17:59, 10.80s/it]
{'loss': 0.0249, 'grad_norm': 0.01684478297829628, 'learning_rate': 4.700979230274829e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.17, 'epoch': 0.55}
55%|█████████████████████████████████████████████████████ | 121/221 [14:51<17:59, 10.80s/it]
55%|█████████████████████████████████████████████████████▌ | 122/221 [14:58<16:01, 9.72s/it]
{'loss': 0.0289, 'grad_norm': 0.019412320107221603, 'learning_rate': 4.626349532067879e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 450.57, 'epoch': 0.55}
55%|█████████████████████████████████████████████████████▌ | 122/221 [14:58<16:01, 9.72s/it]
56%|█████████████████████████████████████████████████████▉ | 123/221 [15:05<14:31, 8.89s/it]
{'loss': 0.0246, 'grad_norm': 0.018697615712881088, 'learning_rate': 4.551803455482833e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 310.05, 'epoch': 0.56}
56%|█████████████████████████████████████████████████████▉ | 123/221 [15:05<14:31, 8.89s/it]
56%|██████████████████████████████████████████████████████▍ | 124/221 [15:12<13:31, 8.36s/it]
{'loss': 0.0245, 'grad_norm': 0.01853892020881176, 'learning_rate': 4.477357683661734e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 448.58, 'epoch': 0.56}
56%|██████████████████████████████████████████████████████▍ | 124/221 [15:12<13:31, 8.36s/it]
57%|██████████████████████████████████████████████████████▊ | 125/221 [15:19<12:44, 7.96s/it]
{'loss': 0.0257, 'grad_norm': 0.017834430560469627, 'learning_rate': 4.403028877298779e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 379.27, 'epoch': 0.57}
57%|██████████████████████████████████████████████████████▊ | 125/221 [15:19<12:44, 7.96s/it]
57%|███████████████████████████████████████████████████████▎ | 126/221 [15:26<12:16, 7.75s/it]
{'loss': 0.027, 'grad_norm': 0.018574297428131104, 'learning_rate': 4.328833670911724e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.42, 'epoch': 0.57}
57%|███████████████████████████████████████████████████████▎ | 126/221 [15:26<12:16, 7.75s/it]
57%|███████████████████████████████████████████████████████▋ | 127/221 [15:34<11:51, 7.57s/it]
{'loss': 0.0333, 'grad_norm': 0.019024794921278954, 'learning_rate': 4.254788669119127e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 458.35, 'epoch': 0.57}
57%|███████████████████████████████████████████████████████▋ | 127/221 [15:34<11:51, 7.57s/it]
58%|████████████████████████████████████████████████████████▏ | 128/221 [15:41<11:30, 7.43s/it]
{'loss': 0.0247, 'grad_norm': 0.018406303599476814, 'learning_rate': 4.180910442924312e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.39, 'epoch': 0.58}
58%|████████████████████████████████████████████████████████▏ | 128/221 [15:41<11:30, 7.43s/it]
58%|████████████████████████████████████████████████████████▌ | 129/221 [15:48<11:15, 7.34s/it]
{'loss': 0.0217, 'grad_norm': 0.016275746747851372, 'learning_rate': 4.107215526006817e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 485.26, 'epoch': 0.58}
58%|████████████████████████████████████████████████████████▌ | 129/221 [15:48<11:15, 7.34s/it]
59%|█████████████████████████████████████████████████████████ | 130/221 [15:55<10:59, 7.25s/it]
{'loss': 0.0284, 'grad_norm': 0.018617160618305206, 'learning_rate': 4.0337204110222346e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.41, 'epoch': 0.59}
59%|█████████████████████████████████████████████████████████ | 130/221 [15:55<10:59, 7.25s/it]
59%|█████████████████████████████████████████████████████████▍ | 131/221 [16:02<10:48, 7.20s/it]
{'loss': 0.0216, 'grad_norm': 0.01993851736187935, 'learning_rate': 3.960441545911204e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.14, 'epoch': 0.59}
59%|█████████████████████████████████████████████████████████▍ | 131/221 [16:02<10:48, 7.20s/it]
60%|█████████████████████████████████████████████████████████▉ | 132/221 [16:09<10:38, 7.17s/it]
{'loss': 0.025, 'grad_norm': 0.017185868695378304, 'learning_rate': 3.887395330218429e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 476.36, 'epoch': 0.6}
60%|█████████████████████████████████████████████████████████▉ | 132/221 [16:09<10:38, 7.17s/it]
60%|██████████████████████████████████████████████████████████▍ | 133/221 [16:16<10:25, 7.11s/it]
{'loss': 0.0264, 'grad_norm': 0.018661336973309517, 'learning_rate': 3.814598111422513e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.55, 'epoch': 0.6}
60%|██████████████████████████████████████████████████████████▍ | 133/221 [16:16<10:25, 7.11s/it]
61%|██████████████████████████████████████████████████████████▊ | 134/221 [16:23<10:18, 7.10s/it]
{'loss': 0.0274, 'grad_norm': 0.022303791716694832, 'learning_rate': 3.742066181277458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.2, 'epoch': 0.61}
61%|██████████████████████████████████████████████████████████▊ | 134/221 [16:23<10:18, 7.10s/it]
61%|███████████████████████████████████████████████████████████▎ | 135/221 [16:30<10:15, 7.16s/it]
{'loss': 0.0217, 'grad_norm': 0.0167496707290411, 'learning_rate': 3.6698157721666246e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.61}
61%|███████████████████████████████████████████████████████████▎ | 135/221 [16:30<10:15, 7.16s/it]
62%|███████████████████████████████████████████████████████████▋ | 136/221 [16:37<10:08, 7.16s/it]
{'loss': 0.0219, 'grad_norm': 0.016045598313212395, 'learning_rate': 3.597863053469987e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 463.19, 'epoch': 0.62}
62%|███████████████████████████████████████████████████████████▋ | 136/221 [16:37<10:08, 7.16s/it]
62%|████████████████████████████████████████████████████████████▏ | 137/221 [16:45<09:59, 7.13s/it]
{'loss': 0.027, 'grad_norm': 0.017510127276182175, 'learning_rate': 3.5262241279454785e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.64, 'epoch': 0.62}
62%|████████████████████████████████████████████████████████████▏ | 137/221 [16:45<09:59, 7.13s/it]
62%|████████████████████████████████████████████████████████████▌ | 138/221 [16:52<09:54, 7.16s/it]
{'loss': 0.0308, 'grad_norm': 0.02389226295053959, 'learning_rate': 3.4549150281252636e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.1, 'epoch': 0.62}
62%|████████████████████████████████████████████████████████████▌ | 138/221 [16:52<09:54, 7.16s/it]
63%|█████████████████████████████████████████████████████████████ | 139/221 [16:59<09:50, 7.20s/it]
{'loss': 0.0275, 'grad_norm': 0.01793692260980606, 'learning_rate': 3.383951712727701e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.85, 'epoch': 0.63}
63%|█████████████████████████████████████████████████████████████ | 139/221 [16:59<09:50, 7.20s/it]
63%|█████████████████████████████████████████████████████████████▍ | 140/221 [17:06<09:39, 7.16s/it]
{'loss': 0.0294, 'grad_norm': 0.02539198286831379, 'learning_rate': 3.313350063085851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.2, 'epoch': 0.63}
63%|█████████████████████████████████████████████████████████████▍ | 140/221 [17:06<09:39, 7.16s/it]
64%|█████████████████████████████████████████████████████████████▉ | 141/221 [17:13<09:30, 7.13s/it]
{'loss': 0.0273, 'grad_norm': 0.019213683903217316, 'learning_rate': 3.243125879593286e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.57, 'epoch': 0.64}
64%|█████████████████████████████████████████████████████████████▉ | 141/221 [17:13<09:30, 7.13s/it]
64%|██████████████████████████████████████████████████████████████▎ | 142/221 [17:20<09:21, 7.11s/it]
{'loss': 0.0357, 'grad_norm': 0.022743066772818565, 'learning_rate': 3.173294878168025e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.85, 'epoch': 0.64}
64%|██████████████████████████████████████████████████████████████▎ | 142/221 [17:20<09:21, 7.11s/it]
65%|██████████████████████████████████████████████████████████████▊ | 143/221 [17:27<09:17, 7.14s/it]
{'loss': 0.0258, 'grad_norm': 0.018310556188225746, 'learning_rate': 3.103872686735358e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 307.54, 'epoch': 0.65}
65%|██████████████████████████████████████████████████████████████▊ | 143/221 [17:27<09:17, 7.14s/it]
65%|███████████████████████████████████████████████████████████████▏ | 144/221 [17:35<09:08, 7.12s/it]
{'loss': 0.0265, 'grad_norm': 0.01981915533542633, 'learning_rate': 3.0348748417303823e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 489.51, 'epoch': 0.65}
65%|███████████████████████████████████████████████████████████████▏ | 144/221 [17:35<09:08, 7.12s/it]
66%|███████████████████████████████████████████████████████████████▋ | 145/221 [17:42<08:58, 7.08s/it]
{'loss': 0.032, 'grad_norm': 0.01881423592567444, 'learning_rate': 2.9663167846209998e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.97, 'epoch': 0.66}
66%|███████████████████████████████████████████████████████████████▋ | 145/221 [17:42<08:58, 7.08s/it]
66%|████████████████████████████████████████████████████████████████ | 146/221 [17:49<08:52, 7.10s/it]
{'loss': 0.029, 'grad_norm': 0.017727544531226158, 'learning_rate': 2.8982138584521735e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 475.56, 'epoch': 0.66}
66%|████████████████████████████████████████████████████████████████ | 146/221 [17:49<08:52, 7.10s/it]
67%|████████████████████████████████████████████████████████████████▌ | 147/221 [17:56<08:45, 7.10s/it]
{'loss': 0.0292, 'grad_norm': 0.01951543428003788, 'learning_rate': 2.8305813044122097e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 376.39, 'epoch': 0.67}
67%|████████████████████████████████████████████████████████████████▌ | 147/221 [17:56<08:45, 7.10s/it]
67%|████████████████████████████████████████████████████████████████▉ | 148/221 [18:03<08:37, 7.09s/it]
{'loss': 0.0272, 'grad_norm': 0.018047522753477097, 'learning_rate': 2.7634342584218365e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 514.29, 'epoch': 0.67}
67%|████████████████████████████████████████████████████████████████▉ | 148/221 [18:03<08:37, 7.09s/it]
67%|█████████████████████████████████████████████████████████████████▍ | 149/221 [18:10<08:35, 7.16s/it]
{'loss': 0.0295, 'grad_norm': 0.020815616473555565, 'learning_rate': 2.6967877477468397e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.84, 'epoch': 0.67}
67%|█████████████████████████████████████████████████████████████████▍ | 149/221 [18:10<08:35, 7.16s/it]
68%|█████████████████████████████████████████████████████████████████▊ | 150/221 [18:17<08:26, 7.13s/it]
{'loss': 0.0303, 'grad_norm': 0.02059975638985634, 'learning_rate': 2.630656687635007e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.18, 'epoch': 0.68}
68%|█████████████████████████████████████████████████████████████████▊ | 150/221 [18:17<08:26, 7.13s/it]
68%|██████████████████████████████████████████████████████████████████▎ | 151/221 [18:24<08:19, 7.13s/it]
{'loss': 0.0297, 'grad_norm': 0.019998600706458092, 'learning_rate': 2.5650558779781635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.86, 'epoch': 0.68}
68%|██████████████████████████████████████████████████████████████████▎ | 151/221 [18:24<08:19, 7.13s/it]
69%|██████████████████████████████████████████████████████████████████▋ | 152/221 [18:32<08:15, 7.19s/it]
{'loss': 0.033, 'grad_norm': 0.022024452686309814, 'learning_rate': 2.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.49, 'epoch': 0.69}
69%|██████████████████████████████████████████████████████████████████▋ | 152/221 [18:32<08:15, 7.19s/it]
69%|███████████████████████████████████████████████████████████████████▏ | 153/221 [18:39<08:13, 7.26s/it]
{'loss': 0.0278, 'grad_norm': 0.019334938377141953, 'learning_rate': 2.43550361297047e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.99, 'epoch': 0.69}
69%|███████████████████████████████████████████████████████████████████▏ | 153/221 [18:39<08:13, 7.26s/it]
70%|███████████████████████████████████████████████████████████████████▌ | 154/221 [18:46<08:04, 7.23s/it]
{'loss': 0.0294, 'grad_norm': 0.02994287945330143, 'learning_rate': 2.371581150947476e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.31, 'epoch': 0.7}
70%|███████████████████████████████████████████████████████████████████▌ | 154/221 [18:46<08:04, 7.23s/it]
70%|████████████████████████████████████████████████████████████████████ | 155/221 [18:53<07:54, 7.19s/it]
{'loss': 0.0228, 'grad_norm': 0.021970966830849648, 'learning_rate': 2.3082469195465893e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.03, 'epoch': 0.7}
70%|████████████████████████████████████████████████████████████████████ | 155/221 [18:53<07:54, 7.19s/it]
71%|████████████████████████████████████████████████████████████████████▍ | 156/221 [19:00<07:44, 7.15s/it]
{'loss': 0.0225, 'grad_norm': 0.017728326842188835, 'learning_rate': 2.245515092739488e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 474.23, 'epoch': 0.71}
71%|████████████████████████████████████████████████████████████████████▍ | 156/221 [19:00<07:44, 7.15s/it]
71%|████████████████████████████████████████████████████████████████████▉ | 157/221 [19:08<07:38, 7.17s/it]
{'loss': 0.0206, 'grad_norm': 0.016926869750022888, 'learning_rate': 2.1833997096818898e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.82, 'epoch': 0.71}
71%|████████████████████████████████████████████████████████████████████▉ | 157/221 [19:08<07:38, 7.17s/it]
71%|█████████████████████████████████████████████████████████████████████▎ | 158/221 [19:15<07:29, 7.14s/it]
{'loss': 0.0281, 'grad_norm': 0.029541337862610817, 'learning_rate': 2.1219146715716332e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.89, 'epoch': 0.71}
71%|█████████████████████████████████████████████████████████████████████▎ | 158/221 [19:15<07:29, 7.14s/it]
72%|█████████████████████████████████████████████████████████████████████▊ | 159/221 [19:22<07:23, 7.15s/it]
{'loss': 0.0234, 'grad_norm': 0.01689094677567482, 'learning_rate': 2.061073738537635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 387.99, 'epoch': 0.72}
72%|█████████████████████████████████████████████████████████████████████▊ | 159/221 [19:22<07:23, 7.15s/it]
72%|██████████████████████████████████████████████████████████████████████▏ | 160/221 [19:29<07:14, 7.13s/it]
{'loss': 0.033, 'grad_norm': 0.01850043050944805, 'learning_rate': 2.0008905265604316e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.06, 'epoch': 0.72}
72%|██████████████████████████████████████████████████████████████████████▏ | 160/221 [19:29<07:14, 7.13s/it]
73%|██████████████████████████████████████████████████████████████████████▋ | 161/221 [19:36<07:07, 7.12s/it]
{'loss': 0.033, 'grad_norm': 0.020465202629566193, 'learning_rate': 1.9413785044249678e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.54, 'epoch': 0.73}
73%|██████████████████████████████████████████████████████████████████████▋ | 161/221 [19:36<07:07, 7.12s/it]
73%|███████████████████████████████████████████████████████████████████████ | 162/221 [19:43<07:00, 7.12s/it]
{'loss': 0.0302, 'grad_norm': 0.019559573382139206, 'learning_rate': 1.8825509907063327e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 482.08, 'epoch': 0.73}
73%|███████████████████████████████████████████████████████████████████████ | 162/221 [19:43<07:00, 7.12s/it]
74%|███████████████████████████████████████████████████████████████████████▌ | 163/221 [19:50<06:51, 7.10s/it]
{'loss': 0.0202, 'grad_norm': 0.016423381865024567, 'learning_rate': 1.8244211507891063e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 425.58, 'epoch': 0.74}
74%|███████████████████████████████████████████████████████████████████████▌ | 163/221 [19:50<06:51, 7.10s/it]
74%|███████████████████████████████████████████████████████████████████████▉ | 164/221 [19:57<06:44, 7.10s/it]
{'loss': 0.0257, 'grad_norm': 0.01980419084429741, 'learning_rate': 1.7670019939210024e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.46, 'epoch': 0.74}
74%|███████████████████████████████████████████████████████████████████████▉ | 164/221 [19:57<06:44, 7.10s/it]
75%|████████████████████████████████████████████████████████████████████████▍ | 165/221 [20:05<06:39, 7.13s/it]
{'loss': 0.025, 'grad_norm': 0.021348468959331512, 'learning_rate': 1.7103063703014372e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 498.15, 'epoch': 0.75}
75%|████████████████████████████████████████████████████████████████████████▍ | 165/221 [20:05<06:39, 7.13s/it]
75%|████████████████████████████████████████████████████████████████████████▊ | 166/221 [20:12<06:29, 7.09s/it]
{'loss': 0.026, 'grad_norm': 0.01638958230614662, 'learning_rate': 1.6543469682057106e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.25, 'epoch': 0.75}
75%|████████████████████████████████████████████████████████████████████████▊ | 166/221 [20:12<06:29, 7.09s/it]
76%|█████████████████████████████████████████████████████████████████████████▎ | 167/221 [20:19<06:23, 7.10s/it]
{'loss': 0.0306, 'grad_norm': 0.02299441583454609, 'learning_rate': 1.599136311145402e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.47, 'epoch': 0.76}
76%|█████████████████████████████████████████████████████████████████████████▎ | 167/221 [20:19<06:23, 7.10s/it]
76%|█████████████████████████████████████████████████████████████████████████▋ | 168/221 [20:26<06:15, 7.09s/it]
{'loss': 0.025, 'grad_norm': 0.017598189413547516, 'learning_rate': 1.544686755065677e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.71, 'epoch': 0.76}
76%|█████████████████████████████████████████████████████████████████████████▋ | 168/221 [20:26<06:15, 7.09s/it]
76%|██████████████████████████████████████████████████████████████████████████▏ | 169/221 [20:33<06:10, 7.13s/it]
{'loss': 0.0236, 'grad_norm': 0.01685059629380703, 'learning_rate': 1.4910104855800427e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.43, 'epoch': 0.76}
76%|██████████████████████████████████████████████████████████████████████████▏ | 169/221 [20:33<06:10, 7.13s/it]
77%|██████████████████████████████████████████████████████████████████████████▌ | 170/221 [20:40<06:03, 7.13s/it]
{'loss': 0.0272, 'grad_norm': 0.018304958939552307, 'learning_rate': 1.438119515243277e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 451.61, 'epoch': 0.77}
77%|██████████████████████████████████████████████████████████████████████████▌ | 170/221 [20:40<06:03, 7.13s/it]
77%|███████████████████████████████████████████████████████████████████████████ | 171/221 [20:47<05:56, 7.12s/it]
{'loss': 0.022, 'grad_norm': 0.018485043197870255, 'learning_rate': 1.3860256808630428e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.29, 'epoch': 0.77}
77%|███████████████████████████████████████████████████████████████████████████ | 171/221 [20:47<05:56, 7.12s/it]
78%|███████████████████████████████████████████████████████████████████████████▍ | 172/221 [20:54<05:50, 7.15s/it]
{'loss': 0.0244, 'grad_norm': 0.018180640414357185, 'learning_rate': 1.3347406408508695e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 401.39, 'epoch': 0.78}
78%|███████████████████████████████████████████████████████████████████████████▍ | 172/221 [20:54<05:50, 7.15s/it]
78%|███████████████████████████████████████████████████████████████████████████▉ | 173/221 [21:01<05:42, 7.13s/it]
{'loss': 0.0284, 'grad_norm': 0.018684981390833855, 'learning_rate': 1.2842758726130283e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.88, 'epoch': 0.78}
78%|███████████████████████████████████████████████████████████████████████████▉ | 173/221 [21:01<05:42, 7.13s/it]
79%|████████████████████████████████████████████████████████████████████████████▎ | 174/221 [21:09<05:36, 7.16s/it]
{'loss': 0.0289, 'grad_norm': 0.021512368693947792, 'learning_rate': 1.2346426699819458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 383.13, 'epoch': 0.79}
79%|████████████████████████████████████████████████████████████████████████████▎ | 174/221 [21:09<05:36, 7.16s/it]
79%|████████████████████████████████████████████████████████████████████████████▊ | 175/221 [21:16<05:29, 7.15s/it]
{'loss': 0.0279, 'grad_norm': 0.023360926657915115, 'learning_rate': 1.1858521406886675e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.55, 'epoch': 0.79}
79%|████████████████████████████████████████████████████████████████████████████▊ | 175/221 [21:16<05:29, 7.15s/it]
80%|█████████████████████████████████████████████████████████████████████████████▏ | 176/221 [21:23<05:21, 7.14s/it]
{'loss': 0.0279, 'grad_norm': 0.022415969520807266, 'learning_rate': 1.137915203877003e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.44, 'epoch': 0.8}
80%|█████████████████████████████████████████████████████████████████████████████▏ | 176/221 [21:23<05:21, 7.14s/it]
80%|█████████████████████████████████████████████████████████████████████████████▋ | 177/221 [21:30<05:14, 7.16s/it]
{'loss': 0.0256, 'grad_norm': 0.01842794381082058, 'learning_rate': 1.090842587659851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.33, 'epoch': 0.8}
80%|█████████████████████████████████████████████████████████████████████████████▋ | 177/221 [21:30<05:14, 7.16s/it]
81%|██████████████████████████████████████████████████████████████████████████████▏ | 178/221 [21:37<05:07, 7.15s/it]
{'loss': 0.0282, 'grad_norm': 0.021043118089437485, 'learning_rate': 1.0446448267182952e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 398.08, 'epoch': 0.81}
81%|██████████████████████████████████████████████████████████████████████████████▏ | 178/221 [21:37<05:07, 7.15s/it]
81%|██████████████████████████████████████████████████████████████████████████████▌ | 179/221 [21:44<05:01, 7.17s/it]
{'loss': 0.0266, 'grad_norm': 0.020777752622961998, 'learning_rate': 9.993322599439692e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.9, 'epoch': 0.81}
81%|██████████████████████████████████████████████████████████████████████████████▌ | 179/221 [21:44<05:01, 7.17s/it]
81%|███████████████████████████████████████████████████████████████████████████████ | 180/221 [21:52<04:52, 7.14s/it]
{'loss': 0.0365, 'grad_norm': 0.026739781722426414, 'learning_rate': 9.549150281252633e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.35, 'epoch': 0.81}
81%|███████████████████████████████████████████████████████████████████████████████ | 180/221 [21:52<04:52, 7.14s/it][2025-11-27 00:44:23,881] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-180
82%|███████████████████████████████████████████████████████████████████████████████▍ | 181/221 [22:11<07:11, 10.79s/it]
{'loss': 0.0358, 'grad_norm': 0.01902214251458645, 'learning_rate': 9.114030716778433e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.08, 'epoch': 0.82}
82%|███████████████████████████████████████████████████████████████████████████████▍ | 181/221 [22:11<07:11, 10.79s/it]
82%|███████████████████████████████████████████████████████████████████████████████▉ | 182/221 [22:18<06:19, 9.74s/it]
{'loss': 0.0249, 'grad_norm': 0.019700270146131516, 'learning_rate': 8.688061284200266e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.96, 'epoch': 0.82}
82%|███████████████████████████████████████████████████████████████████████████████▉ | 182/221 [22:18<06:19, 9.74s/it]
83%|████████████████████████████████████████████████████████████████████████████████▎ | 183/221 [22:25<05:40, 8.95s/it]
{'loss': 0.0259, 'grad_norm': 0.021064477041363716, 'learning_rate': 8.271337313934869e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.67, 'epoch': 0.83}
83%|████████████████████████████████████████████████████████████████████████████████▎ | 183/221 [22:25<05:40, 8.95s/it]
83%|████████████████████████████████████████████████████████████████████████████████▊ | 184/221 [22:33<05:13, 8.47s/it]
{'loss': 0.0291, 'grad_norm': 0.01918896846473217, 'learning_rate': 7.863952067298042e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 285.99, 'epoch': 0.83}
83%|████████████████████████████████████████████████████████████████████████████████▊ | 184/221 [22:33<05:13, 8.47s/it]
84%|█████████████████████████████████████████████████████████████████████████████████▏ | 185/221 [22:40<04:50, 8.08s/it]
{'loss': 0.0298, 'grad_norm': 0.020457495003938675, 'learning_rate': 7.465996715633028e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 322.48, 'epoch': 0.84}
84%|█████████████████████████████████████████████████████████████████████████████████▏ | 185/221 [22:40<04:50, 8.08s/it]
84%|█████████████████████████████████████████████████████████████████████████████████▋ | 186/221 [22:47<04:32, 7.77s/it]
{'loss': 0.0276, 'grad_norm': 0.019534621387720108, 'learning_rate': 7.077560319906695e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 527.31, 'epoch': 0.84}
84%|█████████████████████████████████████████████████████████████████████████████████▋ | 186/221 [22:47<04:32, 7.77s/it]
85%|██████████████████████████████████████████████████████████████████████████████████ | 187/221 [22:54<04:16, 7.55s/it]
{'loss': 0.0284, 'grad_norm': 0.019534330815076828, 'learning_rate': 6.698729810778065e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.99, 'epoch': 0.85}
85%|██████████████████████████████████████████████████████████████████████████████████ | 187/221 [22:54<04:16, 7.55s/it]
85%|██████████████████████████████████████████████████████████████████████████████████▌ | 188/221 [23:01<04:04, 7.42s/it]
{'loss': 0.0306, 'grad_norm': 0.032107334583997726, 'learning_rate': 6.329589969143518e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.76, 'epoch': 0.85}
85%|██████████████████████████████████████████████████████████████████████████████████▌ | 188/221 [23:01<04:04, 7.42s/it]
86%|██████████████████████████████████████████████████████████████████████████████████▉ | 189/221 [23:08<03:56, 7.38s/it]
{'loss': 0.0412, 'grad_norm': 0.019100667908787727, 'learning_rate': 5.9702234071631e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.21, 'epoch': 0.86}
86%|██████████████████████████████████████████████████████████████████████████████████▉ | 189/221 [23:08<03:56, 7.38s/it]
86%|███████████████████████████████████████████████████████████████████████████████████▍ | 190/221 [23:15<03:46, 7.31s/it]
{'loss': 0.027, 'grad_norm': 0.01997012086212635, 'learning_rate': 5.620710549772295e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 335.96, 'epoch': 0.86}
86%|███████████████████████████████████████████████████████████████████████████████████▍ | 190/221 [23:15<03:46, 7.31s/it]
86%|███████████████████████████████████████████████████████████████████████████████████▊ | 191/221 [23:22<03:37, 7.25s/it]
{'loss': 0.0248, 'grad_norm': 0.01957276090979576, 'learning_rate': 5.281129616683167e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 355.72, 'epoch': 0.86}
86%|███████████████████████████████████████████████████████████████████████████████████▊ | 191/221 [23:23<03:37, 7.25s/it]
87%|████████████████████████████████████████████████████████████████████████████████████▎ | 192/221 [23:30<03:29, 7.22s/it]
{'loss': 0.0266, 'grad_norm': 0.019423488527536392, 'learning_rate': 4.951556604879048e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.6, 'epoch': 0.87}
87%|████████████████████████████████████████████████████████████████████████████████████▎ | 192/221 [23:30<03:29, 7.22s/it]
87%|████████████████████████████████████████████████████████████████████████████████████▋ | 193/221 [23:37<03:22, 7.25s/it]
{'loss': 0.0246, 'grad_norm': 0.021110599860548973, 'learning_rate': 4.632065271606756e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.45, 'epoch': 0.87}
87%|████████████████████████████████████████████████████████████████████████████████████▋ | 193/221 [23:37<03:22, 7.25s/it]
88%|█████████████████████████████████████████████████████████████████████████████████████▏ | 194/221 [23:44<03:14, 7.21s/it]
{'loss': 0.026, 'grad_norm': 0.02191292867064476, 'learning_rate': 4.322727117869951e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 404.69, 'epoch': 0.88}
88%|█████████████████████████████████████████████████████████████████████████████████████▏ | 194/221 [23:44<03:14, 7.21s/it]
88%|█████████████████████████████████████████████████████████████████████████████████████▌ | 195/221 [23:51<03:05, 7.15s/it]
{'loss': 0.0277, 'grad_norm': 0.018202103674411774, 'learning_rate': 4.023611372427471e-06, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 501.69, 'epoch': 0.88}
88%|█████████████████████████████████████████████████████████████████████████████████████▌ | 195/221 [23:51<03:05, 7.15s/it]
89%|██████████████████████████████████████████████████████████████████████████████████████ | 196/221 [23:58<02:58, 7.13s/it]
{'loss': 0.0203, 'grad_norm': 0.016622671857476234, 'learning_rate': 3.734784976300165e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.02, 'epoch': 0.89}
89%|██████████████████████████████████████████████████████████████████████████████████████ | 196/221 [23:58<02:58, 7.13s/it]
89%|██████████████████████████████████████████████████████████████████████████████████████▍ | 197/221 [24:05<02:51, 7.16s/it]
{'loss': 0.0281, 'grad_norm': 0.019572410732507706, 'learning_rate': 3.4563125677897932e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.87, 'epoch': 0.89}
89%|██████████████████████████████████████████████████████████████████████████████████████▍ | 197/221 [24:05<02:51, 7.16s/it]
90%|██████████████████████████████████████████████████████████████████████████████████████▉ | 198/221 [24:13<02:44, 7.17s/it]
{'loss': 0.0272, 'grad_norm': 0.02384166046977043, 'learning_rate': 3.18825646801314e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.89, 'epoch': 0.9}
90%|██████████████████████████████████████████████████████████████████████████████████████▉ | 198/221 [24:13<02:44, 7.17s/it]
90%|███████████████████████████████████████████████████████████████████████████████████████▎ | 199/221 [24:20<02:37, 7.14s/it]
{'loss': 0.0291, 'grad_norm': 0.022356677800416946, 'learning_rate': 2.930676666954846e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.48, 'epoch': 0.9}
90%|███████████████████████████████████████████████████████████████████████████████████████▎ | 199/221 [24:20<02:37, 7.14s/it]
90%|███████████████████████████████████████████████████████████████████████████████████████▊ | 200/221 [24:27<02:30, 7.15s/it]
{'loss': 0.0354, 'grad_norm': 0.024130800738930702, 'learning_rate': 2.6836308100417873e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.26, 'epoch': 0.9}
90%|███████████████████████████████████████████████████████████████████████████████████████▊ | 200/221 [24:27<02:30, 7.15s/it]
91%|████████████████████████████████████████████████████████████████████████████████████████▏ | 201/221 [24:34<02:23, 7.17s/it]
{'loss': 0.0267, 'grad_norm': 0.021222814917564392, 'learning_rate': 2.4471741852423237e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 494.07, 'epoch': 0.91}
91%|████████████████████████████████████████████████████████████████████████████████████████▏ | 201/221 [24:34<02:23, 7.17s/it]
91%|████████████████████████████████████████████████████████████████████████████████████████▋ | 202/221 [24:41<02:15, 7.16s/it]
{'loss': 0.0307, 'grad_norm': 0.01970573328435421, 'learning_rate': 2.221359710692961e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.36, 'epoch': 0.91}
91%|████████████████████████████████████████████████████████████████████████████████████████▋ | 202/221 [24:41<02:15, 7.16s/it]
92%|█████████████████████████████████████████████████████████████████████████████████████████ | 203/221 [24:48<02:08, 7.13s/it]
{'loss': 0.0234, 'grad_norm': 0.01854623667895794, 'learning_rate': 2.006237922855553e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 394.22, 'epoch': 0.92}
92%|█████████████████████████████████████████████████████████████████████████████████████████ | 203/221 [24:48<02:08, 7.13s/it]
92%|█████████████████████████████████████████████████████████████████████████████████████████▌ | 204/221 [24:55<02:01, 7.15s/it]
{'loss': 0.0241, 'grad_norm': 0.0231945738196373, 'learning_rate': 1.8018569652073381e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.08, 'epoch': 0.92}
92%|█████████████████████████████████████████████████████████████████████████████████████████▌ | 204/221 [24:55<02:01, 7.15s/it]
93%|█████████████████████████████████████████████████████████████████████████████████████████▉ | 205/221 [25:03<01:54, 7.13s/it]
{'loss': 0.0264, 'grad_norm': 0.018703971058130264, 'learning_rate': 1.6082625774666794e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 471.75, 'epoch': 0.93}
93%|█████████████████████████████████████████████████████████████████████████████████████████▉ | 205/221 [25:03<01:54, 7.13s/it]
93%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 206/221 [25:10<01:47, 7.14s/it]
{'loss': 0.0251, 'grad_norm': 0.017928369343280792, 'learning_rate': 1.4254980853566247e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.2, 'epoch': 0.93}
93%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 206/221 [25:10<01:47, 7.14s/it]
94%|██████████████████████████████████████████████████████████████████████████████████████████▊ | 207/221 [25:17<01:39, 7.13s/it]
{'loss': 0.0239, 'grad_norm': 0.018282128497958183, 'learning_rate': 1.2536043909088191e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.9, 'epoch': 0.94}
94%|██████████████████████████████████████████████████████████████████████████████████████████▊ | 207/221 [25:17<01:39, 7.13s/it]
94%|███████████████████████████████████████████████████████████████████████████████████████████▎ | 208/221 [25:24<01:32, 7.12s/it]
{'loss': 0.0307, 'grad_norm': 0.021411525085568428, 'learning_rate': 1.0926199633097157e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 382.68, 'epoch': 0.94}
94%|███████████████████████████████████████████████████████████████████████████████████████████▎ | 208/221 [25:24<01:32, 7.12s/it]
95%|███████████████████████████████████████████████████████████████████████████████████████████▋ | 209/221 [25:31<01:25, 7.17s/it]
{'loss': 0.0287, 'grad_norm': 0.02065850794315338, 'learning_rate': 9.42580830291373e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 431.06, 'epoch': 0.95}
95%|███████████████████████████████████████████████████████████████████████████████████████████▋ | 209/221 [25:31<01:25, 7.17s/it]
95%|████████████████████████████████████████████████████████████████████████████████████████████▏ | 210/221 [25:39<01:19, 7.21s/it]
{'loss': 0.0212, 'grad_norm': 0.019915733486413956, 'learning_rate': 8.035205700685167e-07, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 418.82, 'epoch': 0.95}
95%|████████████████████████████████████████████████████████████████████████████████████████████▏ | 210/221 [25:39<01:19, 7.21s/it]
95%|████████████████████████████████████████████████████████████████████████████████████████████▌ | 211/221 [25:46<01:11, 7.18s/it]
{'loss': 0.0264, 'grad_norm': 0.020451124757528305, 'learning_rate': 6.75470303823933e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.8, 'epoch': 0.95}
95%|████████████████████████████████████████████████████████████████████████████████████████████▌ | 211/221 [25:46<01:11, 7.18s/it]
96%|█████████████████████████████████████████████████████████████████████████████████████████████ | 212/221 [25:53<01:04, 7.17s/it]
{'loss': 0.0268, 'grad_norm': 0.02076980657875538, 'learning_rate': 5.584586887435739e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.6, 'epoch': 0.96}
96%|█████████████████████████████████████████████████████████████████████████████████████████████ | 212/221 [25:53<01:04, 7.17s/it]
96%|█████████████████████████████████████████████████████████████████████████████████████████████▍ | 213/221 [26:00<00:57, 7.14s/it]
{'loss': 0.0246, 'grad_norm': 0.019138546660542488, 'learning_rate': 4.52511911603265e-07, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 364.67, 'epoch': 0.96}
96%|█████████████████████████████████████████████████████████████████████████████████████████████▍ | 213/221 [26:00<00:57, 7.14s/it]
97%|█████████████████████████████████████████████████████████████████████████████████████████████▉ | 214/221 [26:07<00:49, 7.11s/it]
{'loss': 0.0348, 'grad_norm': 0.026033613830804825, 'learning_rate': 3.576536829081323e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 505.19, 'epoch': 0.97}
97%|█████████████████████████████████████████████████████████████████████████████████████████████▉ | 214/221 [26:07<00:49, 7.11s/it]
97%|██████████████████████████████████████████████████████████████████████████████████████████████▎ | 215/221 [26:14<00:42, 7.11s/it]
{'loss': 0.0308, 'grad_norm': 0.01909700781106949, 'learning_rate': 2.7390523158633554e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 402.37, 'epoch': 0.97}
97%|██████████████████████████████████████████████████████████████████████████████████████████████▎ | 215/221 [26:14<00:42, 7.11s/it]
98%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 216/221 [26:21<00:35, 7.17s/it]
{'loss': 0.0251, 'grad_norm': 0.017447378486394882, 'learning_rate': 2.012853002380466e-07, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.02, 'epoch': 0.98}
98%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 216/221 [26:21<00:35, 7.17s/it]
98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28, 7.16s/it]
{'loss': 0.0294, 'grad_norm': 0.0193310659378767, 'learning_rate': 1.3981014094099353e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 392.76, 'epoch': 0.98}
98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28, 7.16s/it]
99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21, 7.17s/it]
{'loss': 0.0286, 'grad_norm': 0.023237884044647217, 'learning_rate': 8.949351161324227e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 444.9, 'epoch': 0.99}
99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21, 7.17s/it]
99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14, 7.15s/it]
{'loss': 0.0264, 'grad_norm': 0.017193371430039406, 'learning_rate': 5.0346672934270534e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.84, 'epoch': 0.99}
99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14, 7.15s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07, 7.13s/it]
{'loss': 0.0311, 'grad_norm': 0.01859343983232975, 'learning_rate': 2.237838582483387e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 512.81, 'epoch': 1.0}
100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07, 7.13s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00, 7.17s/it]
{'loss': 0.0256, 'grad_norm': 0.021252349019050598, 'learning_rate': 5.594909486328348e-09, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.96, 'epoch': 1.0}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00, 7.17s/it][2025-11-27 00:49:29,268] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-221
{'train_runtime': 1631.9607, 'train_samples_per_second': 17.334, 'train_steps_per_second': 0.135, 'train_loss': 0.030637798653873383, 'memory/max_active (GiB)': 15.75, 'memory/max_allocated (GiB)': 15.75, 'memory/device_reserved (GiB)': 50.97, 'epoch': 1.0}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00, 7.17s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00, 7.37s/it]
[2025-11-27 00:49:33,160] [INFO] [axolotl.train.save_trained_model:218] [PID:80269] Training completed! Saving trained model to ./nov262025-sc-LoRA-Run.
[2025-11-27 00:49:33,820] [INFO] [axolotl.train.save_trained_model:336] [PID:80269] Model successfully saved to ./nov262025-sc-LoRA-Run
|