diff --git "a/debug.log" "b/debug.log"
--- "a/debug.log"
+++ "b/debug.log"
@@ -1,26 +1,27 @@
-[2025-10-18 19:02:01,879] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:42363] baseline 0.000GB ()
-[2025-10-18 19:02:01,880] [INFO] [axolotl.cli.config.load_cfg:248] [PID:42363] config:
+[2026-03-30 13:38:13,335] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:37135] baseline 0.000GB ()
+[2026-03-30 13:38:13,336] [INFO] [axolotl.cli.config.load_cfg:341] [PID:37135] config:
 {
   "activation_offloading": false,
   "adapter": "qlora",
-  "axolotl_config_path": "mrCuddle-stream.yaml",
+  "axolotl_config_path": "rp-sft_Attention-Block-Only_Test.yml",
   "base_model": "google/gemma-2-2b-it",
   "base_model_config": "google/gemma-2-2b-it",
   "batch_size": 8,
   "bf16": true,
   "capabilities": {
     "bf16": true,
-    "compute_capability": "sm_86",
-    "fp8": false,
+    "compute_capability": "sm_100",
+    "fp8": true,
     "n_gpu": 1,
-    "n_node": 1
+    "n_node": 1,
+    "tf32": true
   },
   "context_parallel_size": 1,
   "dataloader_num_workers": 1,
   "dataloader_pin_memory": true,
   "dataloader_prefetch_factor": 256,
-  "dataset_num_proc": 12,
-  "dataset_prepared_path": "last_run_prepared",
+  "dataset_num_proc": 28,
+  "dataset_prepared_path": "/workspace/axolotl/last_run_prepared",
   "datasets": [
     {
       "chat_template": "jinja",
@@ -31,7 +32,7 @@
         "content": "value",
         "role": "from"
       },
-      "path": "AiAF/conversations",
+      "path": ".",
       "roles_to_train": [
         "assistant"
       ],
@@ -45,8 +46,10 @@
   "device": "cuda:0",
   "dion_rank_fraction": 1.0,
   "dion_rank_multiple_of": 1,
+  "eaft_alpha": 1.0,
+  "eaft_k": 20,
   "env_capabilities": {
-    "torch_version": "2.7.1"
+    "torch_version": "2.9.1"
   },
   "eot_tokens": [
     "<end_of_turn>"
@@ -63,10 +66,14 @@
   "eval_steps": 50,
   "eval_strategy": "steps",
   "eval_table_size": 0,
-  "evaluation_strategy": "steps",
   "experimental_skip_move_to_device": true,
   "flash_attention": true,
   "fp16": false,
+  "generate_samples": false,
+  "generation_do_sample": true,
+  "generation_max_new_tokens": 50,
+  "generation_prompt_ratio": 0.5,
+  "generation_temperature": 0.7,
   "gradient_accumulation_steps": 4,
   "gradient_checkpointing": true,
   "gradient_checkpointing_kwargs": {
@@ -78,6 +85,7 @@
   "is_falcon_derived_model": false,
   "is_llama_derived_model": false,
   "is_mistral_derived_model": false,
+  "layer_offloading": false,
   "learning_rate": 0.0002,
   "lisa_layers_attribute": "model.layers",
   "load_best_model_at_end": false,
@@ -87,26 +95,39 @@
   "logging_steps": 1,
   "lora_alpha": 128,
   "lora_dropout": 0.05,
+  "lora_embedding_kernel": true,
+  "lora_mlp_kernel": true,
+  "lora_o_kernel": true,
+  "lora_qkv_kernel": true,
   "lora_r": 64,
-  "lora_target_linear": true,
+  "lora_target_linear": false,
+  "lora_target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj"
+  ],
   "loraplus_lr_embedding": 1e-06,
   "lr_scheduler": "cosine",
   "max_steps": 1000,
   "mean_resizing_embeddings": false,
+  "merge_method": "memory_efficient",
   "micro_batch_size": 2,
   "model_config_type": "gemma2",
   "num_epochs": 1.0,
+  "num_generation_samples": 3,
   "optimizer": "adamw_bnb_8bit",
-  "output_dir": "./outputs/sft/gemma-2-2b-it-rp-sft-qlora",
-  "pad_to_sequence_len": true,
+  "otel_metrics_host": "localhost",
+  "otel_metrics_port": 8000,
+  "output_dir": "/workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora",
   "pretrain_multipack_attn": true,
   "profiler_steps_start": 0,
   "qlora_sharded_model_loading": false,
+  "quantize_moe_experts": false,
   "ray_num_workers": 1,
   "resources_per_worker": {
     "GPU": 1
   },
-  "sample_packing": true,
   "sample_packing_bin_size": 200,
   "sample_packing_group_size": 100000,
   "save_only_model": false,
@@ -123,7 +144,6 @@
     "eos_token": "<eos>",
     "pad_token": "<pad>"
   },
-  "streaming": true,
   "streaming_multipack_buffer_size": 10000,
   "strict": false,
   "tensor_parallel_size": 1,
@@ -131,14 +151,12 @@
     {
       "chat_template": "jinja",
       "chat_template_jinja": "{{ bos_token }}\n{% for m in messages %}\n  {% set role = 'model' if m['role']=='assistant' else 'user' %}\n  {{ '<start_of_turn>' + role + '\\n' + m['content'] | trim + '<end_of_turn>\\n' }}\n{% endfor %}\n{% if add_generation_prompt %}\n{{ '<start_of_turn>model\\n' }}\n{% endif %}\n",
-      "data_files": "eval-datasets/shuf-1000_conversations_V2.jsonl",
       "field_messages": "conversations",
       "message_property_mappings": {
         "content": "value",
         "role": "from"
       },
-      "name": "json",
-      "path": ".",
+      "path": "eval-datasets/shuf-1000_conversations_V3.jsonl",
       "roles_to_train": [
         "assistant"
       ],
@@ -155,17 +173,27 @@
   "torch_dtype": "torch.bfloat16",
   "train_on_inputs": false,
   "trl": {
+    "async_prefetch": false,
     "log_completions": false,
     "mask_truncated_completions": false,
     "ref_model_mixup_alpha": 0.9,
     "ref_model_sync_steps": 64,
+    "replay_buffer_size": 0,
+    "replay_recompute_logps": true,
+    "reroll_max_groups": 1,
+    "reroll_start_fraction": 1.0,
+    "reward_num_workers": 1,
     "scale_rewards": true,
+    "skip_zero_advantage_batches": true,
     "sync_ref_model": false,
+    "use_data_producer": false,
     "use_vllm": false,
+    "vllm_lora_sync": false,
     "vllm_server_host": "0.0.0.0",
     "vllm_server_port": 8000
   },
   "type_of_model": "AutoModelForCausalLM",
+  "use_otel_metrics": false,
   "use_ray": false,
   "use_wandb": true,
   "val_set_size": 0.0,
@@ -177,242 +205,22 @@
     "port": 8000
   },
   "wandb_log_model": "false",
-  "wandb_name": "gemma-2-2b-it-rp-sft-qlora",
+  "wandb_name": "Attention-Block-Only_Test",
   "wandb_project": "rp-sft",
-  "wandb_run_id": "gemma-2-2b-it-rp-sft-qlora",
+  "wandb_run_id": "Attention-Block-Only_Test",
+  "warmup_ratio": 0.03,
   "weight_decay": 0.0,
   "world_size": 1
 }
-[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:42363] EOS: 1 / <eos>
-[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:42363] BOS: 2 / <bos>
-[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42363] PAD: 0 / <pad>
-[2025-10-18 19:02:03,610] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42363] UNK: 3 / <unk>
-[2025-10-18 19:02:17,503] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:42363] Loading prepared dataset from disk at last_run_prepared/323978649404d0f4da7e1f3e2dc7b3de...
-[2025-10-18 19:02:17,508] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:42363] Loading tokenizer... google/gemma-2-2b-it
-[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:42363] EOS: 1 / <eos>
-[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:42363] BOS: 2 / <bos>
-[2025-10-18 19:02:18,978] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42363] PAD: 0 / <pad>
-[2025-10-18 19:02:18,979] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42363] UNK: 3 / <unk>
-[2025-10-18 19:02:18,979] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:42363] Loading model
-[2025-10-18 19:02:19,156] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:42363] Patched Trainer.evaluation_loop with nanmean loss calculation
-[2025-10-18 19:02:19,159] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:42363] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
-[2025-10-18 19:02:19,160] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:42363] Applying multipack dataloader patch for sample packing...
-Loading checkpoint shards:   0%|      | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|▌| 1/2 [00:13<00:13, 13.38s/Loading checkpoint shards: 100%|█| 2/2 [00:14<00:00,  6.01s/Loading checkpoint shards: 100%|█| 2/2 [00:14<00:00,  7.12s/
-[2025-10-18 19:02:34,220] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:42363] converting PEFT model w/ prepare_model_for_kbit_training
-[2025-10-18 19:02:34,224] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:42363] Converting modules to torch.bfloat16
-[2025-10-18 19:02:34,227] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42363] Memory usage after model load 4.378GB (+4.378GB allocated, +4.486GB reserved)
-[2025-10-18 19:02:34,228] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:42363] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
-trainable params: 83,066,880 || all params: 2,697,408,768 || trainable%: 3.0795
-[2025-10-18 19:02:35,141] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42363] after adapters 2.438GB (+2.438GB allocated, +4.625GB reserved)
-[2025-10-18 19:02:42,952] [INFO] [axolotl.train.save_initial_configs:398] [PID:42363] Pre-saving adapter config to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora...
-[2025-10-18 19:02:42,952] [INFO] [axolotl.train.save_initial_configs:402] [PID:42363] Pre-saving tokenizer to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora...
-[2025-10-18 19:02:43,482] [INFO] [axolotl.train.save_initial_configs:407] [PID:42363] Pre-saving model config to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora...
-[2025-10-18 19:02:43,494] [INFO] [axolotl.train.execute_training:196] [PID:42363] Starting trainer...
-[34m[1mwandb[0m: Currently logged in as: [33mfactoryaiart[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
-[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
-[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run gemma-2-2b-it-rp-sft-qlora (0.2s)
-[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.22.2
-[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/axolotl/wandb/run-20251018_190244-gemma-2-2b-it-rp-sft-qlora[0m
-[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
-[34m[1mwandb[0m: Syncing run [33mgemma-2-2b-it-rp-sft-qlora[0m
-[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/factoryaiart/rp-sft[0m
-[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/factoryaiart/rp-sft/runs/gemma-2-2b-it-rp-sft-qlora[0m
-[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
-[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
-[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
-[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
-[2025-10-18 19:02:46,605] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:42363] The Axolotl config has been saved to the WandB run under files.
-  0%|                              | 0/1000 [00:00<?, ?it/s][2025-10-18 19:02:46,619] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 19:02:49,230] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.221949577331543
-[2025-10-18 19:02:50,452] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.221228837966919
-[2025-10-18 19:02:51,753] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3005599975585938
-[2025-10-18 19:02:53,012] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.258347511291504
-[2025-10-18 19:02:53,012] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:21,  8.31it/s][A
-  2%|▍                      | 3/179 [00:00<00:35,  4.96it/s][A
-  2%|▌                      | 4/179 [00:00<00:42,  4.11it/s][A
-  3%|▋                      | 5/179 [00:01<01:00,  2.89it/s][A
-  3%|▊                      | 6/179 [00:01<00:56,  3.06it/s][A
-  4%|▉                      | 7/179 [00:02<00:55,  3.07it/s][A
-  4%|█                      | 8/179 [00:02<00:54,  3.11it/s][A
-  5%|█▏                     | 9/179 [00:02<01:05,  2.59it/s][A
-  6%|█▏                    | 10/179 [00:03<01:00,  2.80it/s][A
-  6%|█▎                    | 11/179 [00:03<00:57,  2.90it/s][A
-  7%|█▍                    | 12/179 [00:03<00:55,  2.99it/s][A
-  7%|█▌                    | 13/179 [00:04<01:05,  2.55it/s][A
-  8%|█▋                    | 14/179 [00:04<00:59,  2.78it/s][A
-  8%|█▊                    | 15/179 [00:04<00:56,  2.89it/s][A
-  9%|█▉                    | 16/179 [00:05<00:54,  2.97it/s][A
-  9%|██                    | 17/179 [00:05<01:03,  2.54it/s][A
- 10%|██▏                   | 18/179 [00:06<00:58,  2.77it/s][A
- 11%|██▎                   | 19/179 [00:06<00:55,  2.89it/s][A
- 11%|██▍                   | 20/179 [00:06<00:53,  2.97it/s][A
- 12%|██▌                   | 21/179 [00:07<01:02,  2.55it/s][A
- 12%|██▋                   | 22/179 [00:07<00:56,  2.77it/s][A
- 13%|██▊                   | 23/179 [00:07<00:54,  2.89it/s][A
- 13%|██▉                   | 24/179 [00:08<00:52,  2.97it/s][A
- 14%|███                   | 25/179 [00:08<01:00,  2.54it/s][A
- 15%|███▏                  | 26/179 [00:08<00:55,  2.77it/s][A
- 15%|███▎                  | 27/179 [00:09<00:52,  2.88it/s][A
- 16%|███▍                  | 28/179 [00:09<00:50,  2.96it/s][A
- 16%|███▌                  | 29/179 [00:10<00:59,  2.54it/s][A
- 17%|███▋                  | 30/179 [00:10<00:53,  2.77it/s][A
- 17%|███▊                  | 31/179 [00:10<00:51,  2.87it/s][A
- 18%|███▉                  | 32/179 [00:10<00:49,  2.96it/s][A
- 18%|████                  | 33/179 [00:11<00:57,  2.52it/s][A
- 19%|████▏                 | 34/179 [00:11<00:52,  2.75it/s][A
- 20%|████▎                 | 35/179 [00:12<00:50,  2.87it/s][A
- 20%|████▍                 | 36/179 [00:12<00:48,  2.96it/s][A
- 21%|████▌                 | 37/179 [00:12<00:56,  2.53it/s][A
- 21%|████▋                 | 38/179 [00:13<00:50,  2.77it/s][A
- 22%|████▊                 | 39/179 [00:13<00:48,  2.88it/s][A
- 22%|████▉                 | 40/179 [00:13<00:46,  2.97it/s][A
- 23%|█████                 | 41/179 [00:14<00:54,  2.54it/s][A
- 23%|█████▏                | 42/179 [00:14<00:49,  2.78it/s][A
- 24%|█████▎                | 43/179 [00:14<00:46,  2.90it/s][A
- 25%|█████▍                | 44/179 [00:15<00:45,  2.97it/s][A
- 25%|█████▌                | 45/179 [00:15<00:53,  2.53it/s][A
- 26%|█████▋                | 46/179 [00:16<00:48,  2.77it/s][A
- 26%|█████▊                | 47/179 [00:16<00:45,  2.89it/s][A
- 27%|█████▉                | 48/179 [00:16<00:44,  2.96it/s][A
- 27%|██████                | 49/179 [00:17<00:51,  2.53it/s][A
- 28%|██████▏               | 50/179 [00:17<00:46,  2.76it/s][A
- 28%|██████▎               | 51/179 [00:17<00:44,  2.87it/s][A
- 29%|██████▍               | 52/179 [00:18<00:42,  2.96it/s][A
- 30%|██████▌               | 53/179 [00:18<00:49,  2.52it/s][A
- 30%|██████▋               | 54/179 [00:19<00:45,  2.76it/s][A
- 31%|██████▊               | 55/179 [00:19<00:43,  2.88it/s][A
- 31%|██████▉               | 56/179 [00:19<00:41,  2.96it/s][A
- 32%|███████               | 57/179 [00:20<00:48,  2.54it/s][A
- 32%|███████▏              | 58/179 [00:20<00:43,  2.77it/s][A
- 33%|███████▎              | 59/179 [00:20<00:41,  2.87it/s][A
- 34%|███████▎              | 60/179 [00:21<00:40,  2.94it/s][A
- 34%|███████▍              | 61/179 [00:21<00:46,  2.52it/s][A
- 35%|███████▌              | 62/179 [00:21<00:42,  2.74it/s][A
- 35%|███████▋              | 63/179 [00:22<00:40,  2.86it/s][A
- 36%|███████▊              | 64/179 [00:22<00:39,  2.95it/s][A
- 36%|███████▉              | 65/179 [00:23<00:45,  2.52it/s][A
- 37%|████████              | 66/179 [00:23<00:41,  2.75it/s][A
- 37%|████████▏             | 67/179 [00:23<00:39,  2.85it/s][A
- 38%|████████▎             | 68/179 [00:23<00:37,  2.93it/s][A
- 39%|████████▍             | 69/179 [00:24<00:44,  2.50it/s][A
- 39%|████████▌             | 70/179 [00:24<00:39,  2.75it/s][A
- 40%|████████▋             | 71/179 [00:25<00:37,  2.85it/s][A
- 40%|████████▊             | 72/179 [00:25<00:36,  2.94it/s][A
- 41%|████████▉             | 73/179 [00:25<00:42,  2.51it/s][A
- 41%|█████████             | 74/179 [00:26<00:38,  2.74it/s][A
- 42%|█████████▏            | 75/179 [00:26<00:36,  2.87it/s][A
- 42%|█████████▎            | 76/179 [00:26<00:34,  2.94it/s][A
- 43%|█████████▍            | 77/179 [00:27<00:42,  2.38it/s][A
- 44%|█████████▌            | 78/179 [00:27<00:38,  2.63it/s][A
- 44%|█████████▋            | 79/179 [00:28<00:36,  2.75it/s][A
- 45%|█████████▊            | 80/179 [00:28<00:34,  2.85it/s][A
- 45%|█████████▉            | 81/179 [00:28<00:40,  2.43it/s][A
- 46%|██████████            | 82/179 [00:29<00:36,  2.68it/s][A
- 46%|██████████▏           | 83/179 [00:29<00:34,  2.80it/s][A
- 47%|██████████▎           | 84/179 [00:29<00:33,  2.88it/s][A
- 47%|██████████▍           | 85/179 [00:30<00:37,  2.49it/s][A
- 48%|██████████▌           | 86/179 [00:30<00:33,  2.74it/s][A
- 49%|██████████▋           | 87/179 [00:31<00:32,  2.86it/s][A
- 49%|██████████▊           | 88/179 [00:31<00:30,  2.95it/s][A
- 50%|██████████▉           | 89/179 [00:31<00:35,  2.53it/s][A
- 50%|███████████           | 90/179 [00:32<00:32,  2.76it/s][A
- 51%|███████████▏          | 91/179 [00:32<00:30,  2.87it/s][A
- 51%|███████████▎          | 92/179 [00:32<00:29,  2.95it/s][A
- 52%|███████████▍          | 93/179 [00:33<00:34,  2.52it/s][A
- 53%|███████████▌          | 94/179 [00:33<00:30,  2.76it/s][A
- 53%|███████████▋          | 95/179 [00:33<00:29,  2.88it/s][A
- 54%|███████████▊          | 96/179 [00:34<00:28,  2.96it/s][A
- 54%|███████████▉          | 97/179 [00:34<00:32,  2.53it/s][A
- 55%|████████████          | 98/179 [00:35<00:29,  2.77it/s][A
- 55%|████████████▏         | 99/179 [00:35<00:27,  2.88it/s][A
- 56%|███████████▋         | 100/179 [00:35<00:26,  2.94it/s][A
- 56%|███████████▊         | 101/179 [00:36<00:30,  2.52it/s][A
- 57%|███████████▉         | 102/179 [00:36<00:28,  2.75it/s][A
- 58%|████████████         | 103/179 [00:36<00:26,  2.86it/s][A
- 58%|████████████▏        | 104/179 [00:37<00:25,  2.94it/s][A
- 59%|████████████▎        | 105/179 [00:37<00:29,  2.51it/s][A
- 59%|████████████▍        | 106/179 [00:37<00:26,  2.75it/s][A
- 60%|███████████���▌        | 107/179 [00:38<00:25,  2.84it/s][A
- 60%|████████████▋        | 108/179 [00:38<00:24,  2.93it/s][A
- 61%|████████████▊        | 109/179 [00:39<00:27,  2.51it/s][A
- 61%|████████████▉        | 110/179 [00:39<00:25,  2.72it/s][A
- 62%|█████████████        | 111/179 [00:39<00:23,  2.85it/s][A
- 63%|█████████████▏       | 112/179 [00:40<00:22,  2.92it/s][A
- 63%|█████████████▎       | 113/179 [00:40<00:26,  2.52it/s][A
- 64%|█████████████▎       | 114/179 [00:40<00:23,  2.74it/s][A
- 64%|█████████████▍       | 115/179 [00:41<00:22,  2.86it/s][A
- 65%|█████████████▌       | 116/179 [00:41<00:21,  2.93it/s][A
- 65%|█████████████▋       | 117/179 [00:42<00:24,  2.51it/s][A
- 66%|█████████████▊       | 118/179 [00:42<00:22,  2.74it/s][A
- 66%|█████████████▉       | 119/179 [00:42<00:21,  2.84it/s][A
- 67%|██████████████       | 120/179 [00:42<00:20,  2.94it/s][A
- 68%|██████████████▏      | 121/179 [00:43<00:23,  2.52it/s][A
- 68%|██████████████▎      | 122/179 [00:43<00:20,  2.75it/s][A
- 69%|██████████████▍      | 123/179 [00:44<00:19,  2.87it/s][A
- 69%|██████████████▌      | 124/179 [00:44<00:18,  2.95it/s][A
- 70%|██████████████▋      | 125/179 [00:44<00:21,  2.53it/s][A
- 70%|██████████████▊      | 126/179 [00:45<00:19,  2.76it/s][A
- 71%|██████████████▉      | 127/179 [00:45<00:18,  2.88it/s][A
- 72%|███████████████      | 128/179 [00:45<00:17,  2.95it/s][A
- 72%|███████████████▏     | 129/179 [00:46<00:19,  2.51it/s][A
- 73%|███████████████▎     | 130/179 [00:46<00:17,  2.73it/s][A
- 73%|███████████████▎     | 131/179 [00:46<00:16,  2.83it/s][A
- 74%|███████████████▍     | 132/179 [00:47<00:16,  2.91it/s][A
- 74%|███████████████▌     | 133/179 [00:47<00:18,  2.49it/s][A
- 75%|███████████████▋     | 134/179 [00:48<00:16,  2.72it/s][A
- 75%|███████████████▊     | 135/179 [00:48<00:15,  2.84it/s][A
- 76%|███████████████▉     | 136/179 [00:48<00:14,  2.93it/s][A
- 77%|████████████████     | 137/179 [00:49<00:16,  2.51it/s][A
- 77%|████████████████▏    | 138/179 [00:49<00:14,  2.75it/s][A
- 78%|████████████████▎    | 139/179 [00:49<00:14,  2.85it/s][A
- 78%|████████████████▍    | 140/179 [00:50<00:13,  2.93it/s][A
- 79%|████████████████▌    | 141/179 [00:50<00:15,  2.50it/s][A
- 79%|████████████████▋    | 142/179 [00:51<00:13,  2.74it/s][A
- 80%|████████████████▊    | 143/179 [00:51<00:12,  2.85it/s][A
- 80%|████████████████▉    | 144/179 [00:51<00:11,  2.93it/s][A
- 81%|█████████████████    | 145/179 [00:52<00:13,  2.50it/s][A
- 82%|█████████████████▏   | 146/179 [00:52<00:12,  2.74it/s][A
- 82%|█████████████████▏   | 147/179 [00:52<00:11,  2.85it/s][A
- 83%|█████████████████▎   | 148/179 [00:53<00:10,  2.94it/s][A
- 83%|█████████████████▍   | 149/179 [00:53<00:11,  2.52it/s][A
- 84%|█████████████████▌   | 150/179 [00:53<00:10,  2.74it/s][A
- 84%|█████████████████▋   | 151/179 [00:54<00:09,  2.84it/s][A
- 85%|█████████████████▊   | 152/179 [00:54<00:09,  2.94it/s][A
- 85%|█████████████████▉   | 153/179 [00:55<00:10,  2.52it/s][A
- 86%|██████████████████   | 154/179 [00:55<00:09,  2.76it/s][A
- 87%|██████████████████▏  | 155/179 [00:55<00:08,  2.87it/s][A
- 87%|██████████████████▎  | 156/179 [00:56<00:07,  2.93it/s][A
- 88%|██████████████████▍  | 157/179 [00:56<00:08,  2.51it/s][A
- 88%|██████████████████▌  | 158/179 [00:56<00:07,  2.73it/s][A
- 89%|██████████████████▋  | 159/179 [00:57<00:07,  2.85it/s][A
- 89%|██████████████████▊  | 160/179 [00:57<00:06,  2.94it/s][A
- 90%|██████████████████▉  | 161/179 [00:58<00:07,  2.51it/s][A
- 91%|███████████████████  | 162/179 [00:58<00:06,  2.75it/s][A
- 91%|███████████████████  | 163/179 [00:58<00:05,  2.86it/s][A
- 92%|███████████████████▏ | 164/179 [00:58<00:05,  2.93it/s][A
- 92%|███████████████████▎ | 165/179 [00:59<00:05,  2.51it/s][A
- 93%|███████████████████▍ | 166/179 [00:59<00:04,  2.72it/s][A
- 93%|███████████████████▌ | 167/179 [01:00<00:04,  2.84it/s][A
- 94%|███████████████████▋ | 168/179 [01:00<00:03,  2.92it/s][A
- 94%|███████████████████▊ | 169/179 [01:00<00:03,  2.50it/s][A
- 95%|███████████████████▉ | 170/179 [01:01<00:03,  2.73it/s][A
- 96%|████████████████████ | 171/179 [01:01<00:02,  2.83it/s][A
- 96%|████████████████████▏| 172/179 [01:01<00:02,  2.91it/s][A
- 97%|████████████████████▎| 173/179 [01:02<00:02,  2.49it/s][A
- 97%|████████████████████▍| 174/179 [01:02<00:01,  2.72it/s][A
- 98%|████████████████████▌| 175/179 [01:03<00:01,  2.83it/s][A
- 98%|████████████████████▋| 176/179 [01:03<00:01,  2.91it/s][A
- 99%|████████████████████▊| 177/179 [01:03<00:00,  2.50it/s][A
- 99%|████████████████████▉| 178/179 [01:04<00:00,  2.72it/s][A
-100%|█████████████████████| 179/179 [01:04<00:00,  2.64it/s][A                                                            
-                                                            [A{'eval_loss': 3.165417432785034, 'eval_runtime': 67.6725, 'eval_samples_per_second': 2.896, 'eval_steps_per_second': 1.448, 'memory/max_active (GiB)': 7.61, 'memory/max_allocated (GiB)': 7.61, 'memory/device_reserved (GiB)': 8.66, 'epoch': 0}
-  0%|                              | 0/1000 [01:14<?, ?it/s]
-100%|█████████████████████| 179/179 [01:04<00:00,  2.64it/s][A
-                                                            [A[2025-10-18 19:04:31,800] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42528] Loading dataset: AiAF/conversations with base_type: chat_template and prompt_style: None
-[2025-10-18 19:04:31,818] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:42528] Using chat template:
+[2026-03-30 13:38:15,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:37135] EOS: 1 / <eos>
+[2026-03-30 13:38:15,214] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:37135] BOS: 2 / <bos>
+[2026-03-30 13:38:15,215] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:37135] PAD: 0 / <pad>
+[2026-03-30 13:38:15,215] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:37135] UNK: 3 / <unk>
+[2026-03-30 13:38:15,216] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:37135] Unable to find prepared dataset in /workspace/axolotl/last_run_prepared/f493251e06461a149e3a38551d1b7982
+[2026-03-30 13:38:15,217] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37135] Loading raw datasets...
+[2026-03-30 13:38:15,217] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37135] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+[2026-03-30 13:38:15,533] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37135] Loading dataset: . with base_type: chat_template and prompt_style: None
+[2026-03-30 13:38:15,536] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:37135] Using chat template:
 ---
 {{ bos_token }}
 {% for m in messages %}
@@ -424,1047 +232,17 @@ trainable params: 83,066,880 || all params: 2,697,408,768 || trainable%: 3.0795
 {% endif %}
 
 ---
-
-Tokenizing Prompts (num_proc=12):   0%| | 0/10000 [00:00<?, [A
-Tokenizing Prompts (num_proc=12):   8%| | 833/10000 [13:09<2[A
-Tokenizing Prompts (num_proc=12):  17%|▏| 1666/10000 [13:22<[A
-Tokenizing Prompts (num_proc=12):  25%|▎| 2500/10000 [14:02<[A
-Tokenizing Prompts (num_proc=12):  33%|▎| 3334/10000 [15:16<[A
-Tokenizing Prompts (num_proc=12):  42%|▍| 4168/10000 [15:41<[A
-Tokenizing Prompts (num_proc=12):  50%|▌| 5001/10000 [16:05<[A
-Tokenizing Prompts (num_proc=12):  58%|▌| 5834/10000 [17:50<[A
-Tokenizing Prompts (num_proc=12):  67%|▋| 6667/10000 [21:09<[A
-Tokenizing Prompts (num_proc=12):  75%|▊| 7500/10000 [23:50<[A
-Tokenizing Prompts (num_proc=12):  83%|▊| 8333/10000 [24:38<[A
-Tokenizing Prompts (num_proc=12):  92%|▉| 9166/10000 [26:51<[A
-Tokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [32:14[ATokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [32:16
-
-Dropping Long Sequences:   0%| | 0/10000 [00:00<?, ? example[A
-Dropping Long Sequences:  10%| | 1000/10000 [00:09<01:27, 10[A
-Dropping Long Sequences:  20%|▏| 2000/10000 [00:17<01:06, 12[A
-Dropping Long Sequences:  30%|▎| 3000/10000 [00:24<00:53, 12[A
-Dropping Long Sequences:  40%|▍| 4000/10000 [00:31<00:46, 13[A
-Dropping Long Sequences:  50%|▌| 5000/10000 [00:38<00:37, 13[A
-Dropping Long Sequences:  60%|▌| 6000/10000 [00:45<00:29, 13[A
-Dropping Long Sequences:  70%|▋| 7000/10000 [00:52<00:21, 14[A
-Dropping Long Sequences:  80%|▊| 8000/10000 [00:59<00:14, 14[A
-Dropping Long Sequences:  90%|▉| 9000/10000 [01:05<00:06, 14[A
-Dropping Long Sequences: 100%|█| 10000/10000 [01:12<00:00, 1[ADropping Long Sequences: 100%|█| 10000/10000 [01:12<00:00, 1
-
-Add position_id column (Pretraining Sample Packing):   0%| |[A
-Add position_id column (Pretraining Sample Packing):  42%|▍|[A
-Add position_id column (Pretraining Sample Packing):  84%|▊|[A
-Add position_id column (Pretraining Sample Packing): 100%|█|[AAdd position_id column (Pretraining Sample Packing): 100%|█|
-[2025-10-18 19:38:08,296] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42528] Using single process for pack_parallel, running sequentially.
-[2025-10-18 19:38:19,443] [WARNING] [py.warnings._showwarnmsg:110] [PID:42528] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
-  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
-
-  0%|                | 1/1000 [35:40<594:06:42, 2140.94s/it]                                                            {'loss': 3.0385, 'grad_norm': 4.033161640167236, 'learning_rate': 0.0, 'memory/max_active (GiB)': 17.28, 'memory/max_allocated (GiB)': 17.28, 'memory/device_reserved (GiB)': 19.35, 'tokens_per_second_per_gpu': 18333.79, 'epoch': 0.0}
-  0%|                | 1/1000 [35:40<594:06:42, 2140.94s/it]  0%|                 | 2/1000 [35:48<245:35:41, 885.91s/it]                                                            {'loss': 3.0598, 'grad_norm': 3.149260997772217, 'learning_rate': 6.666666666666667e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1198.51, 'epoch': 0.0}
-  0%|                 | 2/1000 [35:48<245:35:41, 885.91s/it]  0%|                 | 3/1000 [35:55<134:15:37, 484.79s/it]                                                            {'loss': 2.9327, 'grad_norm': 3.456252336502075, 'learning_rate': 1.3333333333333333e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1148.73, 'epoch': 0.0}
-  0%|                 | 3/1000 [35:55<134:15:37, 484.79s/it]  0%|                  | 4/1000 [36:03<81:59:23, 296.35s/it]                                                            {'loss': 3.048, 'grad_norm': 2.9860503673553467, 'learning_rate': 2e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 750.95, 'epoch': 0.0}
-  0%|                  | 4/1000 [36:03<81:59:23, 296.35s/it]  0%|                  | 5/1000 [36:10<53:07:06, 192.19s/it]                                                            {'loss': 2.7464, 'grad_norm': 2.023449420928955, 'learning_rate': 2.6666666666666667e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 877.1, 'epoch': 0.01}
-  0%|                  | 5/1000 [36:10<53:07:06, 192.19s/it]  1%|                  | 6/1000 [36:18<35:43:34, 129.39s/it]                                                            {'loss': 2.8976, 'grad_norm': 2.2616541385650635, 'learning_rate': 3.3333333333333335e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 600.15, 'epoch': 0.01}
-  1%|                  | 6/1000 [36:18<35:43:34, 129.39s/it]  1%|▏                  | 7/1000 [36:25<24:42:15, 89.56s/it]                                                            {'loss': 2.8685, 'grad_norm': 1.926656723022461, 'learning_rate': 4e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1162.78, 'epoch': 0.01}
-  1%|▏                  | 7/1000 [36:25<24:42:15, 89.56s/it]  1%|▏                  | 8/1000 [36:33<17:29:08, 63.46s/it]                                                            {'loss': 2.9784, 'grad_norm': 2.246783494949341, 'learning_rate': 4.666666666666667e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1222.76, 'epoch': 0.01}
-  1%|▏                  | 8/1000 [36:33<17:29:08, 63.46s/it]  1%|▏                  | 9/1000 [36:40<12:39:30, 45.98s/it]                                                            {'loss': 2.6622, 'grad_norm': 1.9021235704421997, 'learning_rate': 5.333333333333333e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1201.64, 'epoch': 0.01}
-  1%|▏                  | 9/1000 [36:40<12:39:30, 45.98s/it]  1%|▏                  | 10/1000 [36:48<9:23:13, 34.13s/it]                                                            {'loss': 2.6967, 'grad_norm': 1.279630184173584, 'learning_rate': 6e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1096.55, 'epoch': 0.01}
-  1%|▏                  | 10/1000 [36:48<9:23:13, 34.13s/it]  1%|▏                  | 11/1000 [36:56<7:08:45, 26.01s/it]                                                            {'loss': 2.707, 'grad_norm': 1.2532055377960205, 'learning_rate': 6.666666666666667e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1097.6, 'epoch': 0.01}
-  1%|▏                  | 11/1000 [36:56<7:08:45, 26.01s/it]  1%|▏                  | 12/1000 [37:03<5:35:55, 20.40s/it]                                                            {'loss': 2.9011, 'grad_norm': 1.1955878734588623, 'learning_rate': 7.333333333333333e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1142.1, 'epoch': 0.01}
-  1%|▏                  | 12/1000 [37:03<5:35:55, 20.40s/it]  1%|▏                  | 13/1000 [37:11<4:31:56, 16.53s/it]                                                            {'loss': 2.4094, 'grad_norm': 1.154872179031372, 'learning_rate': 8e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 818.65, 'epoch': 0.01}
-  1%|▏                  | 13/1000 [37:11<4:31:56, 16.53s/it]  1%|▎                  | 14/1000 [37:18<3:47:27, 13.84s/it]                                                            {'loss': 2.8629, 'grad_norm': 1.0860029458999634, 'learning_rate': 8.666666666666667e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1091.01, 'epoch': 0.01}
-  1%|▎                  | 14/1000 [37:18<3:47:27, 13.84s/it]  2%|▎                  | 15/1000 [37:26<3:16:31, 11.97s/it]                                                            {'loss': 2.7485, 'grad_norm': 1.2931734323501587, 'learning_rate': 9.333333333333334e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1144.27, 'epoch': 0.01}
-  2%|▎                  | 15/1000 [37:26<3:16:31, 11.97s/it]  2%|▎                  | 16/1000 [37:34<2:55:09, 10.68s/it]                                                            {'loss': 2.5887, 'grad_norm': 0.9242000579833984, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1236.8, 'epoch': 0.02}
-  2%|▎                  | 16/1000 [37:34<2:55:09, 10.68s/it]  2%|▎                  | 17/1000 [37:41<2:40:00,  9.77s/it]                                                            {'loss': 2.7701, 'grad_norm': 0.8560150861740112, 'learning_rate': 0.00010666666666666667, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1335.52, 'epoch': 0.02}
-  2%|▎                  | 17/1000 [37:41<2:40:00,  9.77s/it]  2%|▎                  | 18/1000 [37:49<2:29:38,  9.14s/it]                                                            {'loss': 2.962, 'grad_norm': 1.1581281423568726, 'learning_rate': 0.00011333333333333334, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 874.38, 'epoch': 0.02}
-  2%|▎                  | 18/1000 [37:49<2:29:38,  9.14s/it]  2%|▎                  | 19/1000 [37:57<2:22:16,  8.70s/it]                                                            {'loss': 2.7912, 'grad_norm': 1.2101572751998901, 'learning_rate': 0.00012, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 647.46, 'epoch': 0.02}
-  2%|▎                  | 19/1000 [37:57<2:22:16,  8.70s/it]  2%|▍                  | 20/1000 [38:04<2:17:12,  8.40s/it]                                                            {'loss': 2.7377, 'grad_norm': 1.122979998588562, 'learning_rate': 0.00012666666666666666, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 719.08, 'epoch': 0.02}
-  2%|▍                  | 20/1000 [38:04<2:17:12,  8.40s/it]  2%|▍                  | 21/1000 [38:12<2:13:42,  8.19s/it]                                                            {'loss': 2.7394, 'grad_norm': 1.2482781410217285, 'learning_rate': 0.00013333333333333334, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 612.84, 'epoch': 0.02}
-  2%|▍                  | 21/1000 [38:12<2:13:42,  8.19s/it]  2%|▍                  | 22/1000 [38:20<2:11:07,  8.04s/it]                                                            {'loss': 2.7289, 'grad_norm': 1.285197377204895, 'learning_rate': 0.00014, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 898.78, 'epoch': 0.02}
-  2%|▍                  | 22/1000 [38:20<2:11:07,  8.04s/it]  2%|▍                  | 23/1000 [38:28<2:09:16,  7.94s/it]                                                            {'loss': 2.6914, 'grad_norm': 0.9495914578437805, 'learning_rate': 0.00014666666666666666, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 982.07, 'epoch': 0.02}
-  2%|▍                  | 23/1000 [38:28<2:09:16,  7.94s/it]  2%|▍                  | 24/1000 [38:35<2:07:55,  7.86s/it]                                                            {'loss': 2.7317, 'grad_norm': 1.0743474960327148, 'learning_rate': 0.00015333333333333334, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 695.3, 'epoch': 0.02}
-  2%|▍                  | 24/1000 [38:35<2:07:55,  7.86s/it]  2%|▍                  | 25/1000 [38:43<2:06:57,  7.81s/it]                                                            {'loss': 2.7183, 'grad_norm': 0.8562339544296265, 'learning_rate': 0.00016, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1103.02, 'epoch': 0.03}
-  2%|▍                  | 25/1000 [38:43<2:06:57,  7.81s/it]  3%|▍                  | 26/1000 [38:51<2:06:19,  7.78s/it]                                                            {'loss': 2.5957, 'grad_norm': 0.8150955438613892, 'learning_rate': 0.0001666666666666667, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1097.5, 'epoch': 0.03}
-  3%|▍                  | 26/1000 [38:51<2:06:19,  7.78s/it]  3%|▌                  | 27/1000 [38:58<2:05:50,  7.76s/it]                                                            {'loss': 2.7314, 'grad_norm': 0.7931953072547913, 'learning_rate': 0.00017333333333333334, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1172.38, 'epoch': 0.03}
-  3%|▌                  | 27/1000 [38:58<2:05:50,  7.76s/it]  3%|▌                  | 28/1000 [39:06<2:05:30,  7.75s/it]                                                            {'loss': 2.7512, 'grad_norm': 0.834540069103241, 'learning_rate': 0.00018, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1181.07, 'epoch': 0.03}
-  3%|▌                  | 28/1000 [39:06<2:05:30,  7.75s/it]  3%|▌                  | 29/1000 [39:14<2:05:09,  7.73s/it]                                                            {'loss': 2.6324, 'grad_norm': 0.8404369950294495, 'learning_rate': 0.0001866666666666667, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 922.09, 'epoch': 0.03}
-  3%|▌                  | 29/1000 [39:14<2:05:09,  7.73s/it]  3%|▌                  | 30/1000 [39:22<2:04:57,  7.73s/it]                                                            {'loss': 2.6468, 'grad_norm': 0.9338690042495728, 'learning_rate': 0.00019333333333333333, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 726.25, 'epoch': 0.03}
-  3%|▌                  | 30/1000 [39:22<2:04:57,  7.73s/it]  3%|▌                  | 31/1000 [39:29<2:04:41,  7.72s/it]                                                            {'loss': 2.6433, 'grad_norm': 0.9352913498878479, 'learning_rate': 0.0002, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 689.75, 'epoch': 0.03}
-  3%|▌                  | 31/1000 [39:29<2:04:41,  7.72s/it]  3%|▌                  | 32/1000 [39:37<2:04:23,  7.71s/it]                                                            {'loss': 2.5828, 'grad_norm': 0.8315832018852234, 'learning_rate': 0.00019999947552365961, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1009.02, 'epoch': 0.03}
-  3%|▌                  | 32/1000 [39:37<2:04:23,  7.71s/it]  3%|▋                  | 33/1000 [39:45<2:04:23,  7.72s/it]                                                            {'loss': 2.6666, 'grad_norm': 0.7479775547981262, 'learning_rate': 0.00019999790210013988, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1307.23, 'epoch': 0.03}
-  3%|▋                  | 33/1000 [39:45<2:04:23,  7.72s/it]  3%|▋                  | 34/1000 [39:52<2:04:05,  7.71s/it]                                                            {'loss': 2.708, 'grad_norm': 0.8287420272827148, 'learning_rate': 0.0001999952797459453, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 888.52, 'epoch': 0.03}
-  3%|▋                  | 34/1000 [39:52<2:04:05,  7.71s/it]  4%|▋                  | 35/1000 [40:00<2:03:56,  7.71s/it]                                                            {'loss': 2.6907, 'grad_norm': 0.9301550388336182, 'learning_rate': 0.0001999916084885832, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 724.52, 'epoch': 0.04}
-  4%|▋                  | 35/1000 [40:00<2:03:56,  7.71s/it]  4%|▋                  | 36/1000 [40:08<2:03:53,  7.71s/it]                                                            {'loss': 2.706, 'grad_norm': 1.0113674402236938, 'learning_rate': 0.00019998688836656323, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 621.34, 'epoch': 0.04}
-  4%|▋                  | 36/1000 [40:08<2:03:53,  7.71s/it]  4%|▋                  | 37/1000 [40:15<2:03:40,  7.71s/it]                                                            {'loss': 2.8472, 'grad_norm': 0.8708809614181519, 'learning_rate': 0.0001999811194293973, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1073.94, 'epoch': 0.04}
-  4%|▋                  | 37/1000 [40:15<2:03:40,  7.71s/it]  4%|▋                  | 38/1000 [40:23<2:03:29,  7.70s/it]                                                            {'loss': 2.6066, 'grad_norm': 0.8908202648162842, 'learning_rate': 0.00019997430173759875, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1081.11, 'epoch': 0.04}
-  4%|▋                  | 38/1000 [40:23<2:03:29,  7.70s/it]  4%|▋                  | 39/1000 [40:31<2:03:22,  7.70s/it]                                                            {'loss': 2.6677, 'grad_norm': 0.9377694725990295, 'learning_rate': 0.00019996643536268204, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 672.27, 'epoch': 0.04}
-  4%|▋                  | 39/1000 [40:31<2:03:22,  7.70s/it]  4%|▊                  | 40/1000 [40:39<2:03:17,  7.71s/it]                                                            {'loss': 2.7082, 'grad_norm': 0.7817007899284363, 'learning_rate': 0.00019995752038716168, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1068.3, 'epoch': 0.04}
-  4%|▊                  | 40/1000 [40:39<2:03:17,  7.71s/it]  4%|▊                  | 41/1000 [40:46<2:03:04,  7.70s/it]                                                            {'loss': 2.6364, 'grad_norm': 0.8867801427841187, 'learning_rate': 0.00019994755690455152, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 771.28, 'epoch': 0.04}
-  4%|▊                  | 41/1000 [40:46<2:03:04,  7.70s/it]  4%|▊                  | 42/1000 [40:54<2:03:02,  7.71s/it]                                                            {'loss': 2.671, 'grad_norm': 0.8036760091781616, 'learning_rate': 0.0001999365450193638, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 936.84, 'epoch': 0.04}
-  4%|▊                  | 42/1000 [40:54<2:03:02,  7.71s/it]  4%|▊                  | 43/1000 [41:02<2:02:55,  7.71s/it]                                                            {'loss': 2.8347, 'grad_norm': 0.8083842992782593, 'learning_rate': 0.00019992448484710797, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 879.42, 'epoch': 0.04}
-  4%|▊                  | 43/1000 [41:02<2:02:55,  7.71s/it]  4%|▊                  | 44/1000 [41:09<2:02:44,  7.70s/it]                                                            {'loss': 2.7744, 'grad_norm': 0.9920223355293274, 'learning_rate': 0.00019991137651428957, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 593.51, 'epoch': 0.04}
-  4%|▊                  | 44/1000 [41:09<2:02:44,  7.70s/it]  4%|▊                  | 45/1000 [41:17<2:02:38,  7.71s/it]                                                            {'loss': 2.6234, 'grad_norm': 0.8331083655357361, 'learning_rate': 0.0001998972201584088, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 925.27, 'epoch': 0.04}
-  4%|▊                  | 45/1000 [41:17<2:02:38,  7.71s/it]  5%|▊                  | 46/1000 [41:25<2:02:38,  7.71s/it]                                                            {'loss': 2.7897, 'grad_norm': 0.8912681341171265, 'learning_rate': 0.0001998820159279591, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 847.7, 'epoch': 0.05}
-  5%|▊                  | 46/1000 [41:25<2:02:38,  7.71s/it]  5%|▉                  | 47/1000 [41:33<2:02:28,  7.71s/it]                                                            {'loss': 2.9623, 'grad_norm': 1.0203715562820435, 'learning_rate': 0.00019986576398242566, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 746.18, 'epoch': 0.05}
-  5%|▉                  | 47/1000 [41:33<2:02:28,  7.71s/it]  5%|▉                  | 48/1000 [41:40<2:02:16,  7.71s/it]                                                            {'loss': 2.5059, 'grad_norm': 0.7893779873847961, 'learning_rate': 0.0001998484644922837, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1069.71, 'epoch': 0.05}
-  5%|▉                  | 48/1000 [41:40<2:02:16,  7.71s/it]  5%|▉                  | 49/1000 [41:48<2:02:06,  7.70s/it]                                                            {'loss': 2.5484, 'grad_norm': 0.7218884229660034, 'learning_rate': 0.00019983011763899673, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1185.31, 'epoch': 0.05}
-  5%|▉                  | 49/1000 [41:48<2:02:06,  7.70s/it]  5%|▉                  | 50/1000 [41:56<2:02:08,  7.71s/it]                                                            {'loss': 2.7377, 'grad_norm': 0.7844395041465759, 'learning_rate': 0.0001998107236150145, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.75, 'tokens_per_second_per_gpu': 1081.63, 'epoch': 0.05}
-  5%|▉                  | 50/1000 [41:56<2:02:08,  7.71s/it][2025-10-18 19:44:42,774] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 19:44:45,590] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3574237823486328
-[2025-10-18 19:44:46,947] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3563930988311768
-[2025-10-18 19:44:48,307] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3596258163452148
-[2025-10-18 19:44:49,675] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3678362369537354
-[2025-10-18 19:44:49,675] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:28,  6.17it/s][A
-  2%|▍                      | 3/179 [00:00<00:40,  4.35it/s][A
-  2%|▌                      | 4/179 [00:00<00:46,  3.78it/s][A
-  3%|▋                      | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                      | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|▉                      | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█                      | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▏                     | 9/179 [00:03<01:10,  2.42it/s][A
-  6%|█▏                    | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▎                    | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▍                    | 12/179 [00:04<00:58,  2.87it/s][A
-  7%|█▌                    | 13/179 [00:04<01:07,  2.46it/s][A
-  8%|█▋                    | 14/179 [00:05<01:01,  2.69it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.80it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██                    | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▏                   | 18/179 [00:06<01:00,  2.68it/s][A
- 11%|██▎                   | 19/179 [00:06<00:57,  2.78it/s][A
- 11%|██▍                   | 20/179 [00:07<00:55,  2.86it/s][A
- 12%|██▌                   | 21/179 [00:07<01:03,  2.47it/s][A
- 12%|██▋                   | 22/179 [00:07<00:58,  2.68it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|██▉                   | 24/179 [00:08<00:53,  2.87it/s][A
- 14%|███                   | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▏                  | 26/179 [00:09<00:57,  2.68it/s][A
- 15%|███▎                  | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▍                  | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|███▌                  | 29/179 [00:10<01:00,  2.46it/s][A
- 17%|███▋                  | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|███▊                  | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|███▉                  | 32/179 [00:11<00:51,  2.87it/s][A
- 18%|████                  | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▏                 | 34/179 [00:12<00:54,  2.68it/s][A
- 20%|████▎                 | 35/179 [00:12<00:51,  2.78it/s][A
- 20%|████▍                 | 36/179 [00:13<00:50,  2.86it/s][A
- 21%|████▌                 | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|████▋                 | 38/179 [00:13<00:52,  2.69it/s][A
- 22%|████▊                 | 39/179 [00:14<00:49,  2.80it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.87it/s][A
- 23%|█████                 | 41/179 [00:15<00:55,  2.47it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.79it/s][A
- 25%|█████▍                | 44/179 [00:16<00:47,  2.86it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|█████▊                | 47/179 [00:17<00:47,  2.80it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.88it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.48it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.70it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.80it/s][A
- 29%|██████▍               | 52/179 [00:19<00:44,  2.86it/s][A
- 30%|██████▌               | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.78it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.46it/s][A
- 32%|███████▏              | 58/179 [00:21<00:45,  2.68it/s][A
- 33%|███████▎              | 59/179 [00:21<00:43,  2.79it/s][A
- 34%|███████▎              | 60/179 [00:22<00:41,  2.86it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.80it/s][A
- 36%|███████▊              | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|███████▉              | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|████████              | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|████████▏             | 67/179 [00:24<00:40,  2.80it/s][A
- 38%|████████▎             | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.46it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|████████▉             | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|█████████             | 74/179 [00:27<00:38,  2.69it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:41,  2.48it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.47it/s][A
- 46%|██████████            | 82/179 [00:30<00:36,  2.69it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:38,  2.47it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:32,  2.81it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:31,  2.88it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:36,  2.48it/s][A
- 50%|███████████           | 90/179 [00:33<00:32,  2.71it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.81it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.88it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:34,  2.47it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.70it/s][A
- 53%|███████████▋          | 95/179 [00:35<00:30,  2.79it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:28,  2.87it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.48it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.69it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.79it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|████████████         | 103/179 [00:37<00:27,  2.80it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|████████████▌        | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.87it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:28,  2.45it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.70it/s][A
- 62%|█████████████        | 111/179 [00:40<00:24,  2.81it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.89it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.70it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:22,  2.81it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:21,  2.89it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.47it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|█████████████▉       | 119/179 [00:43<00:21,  2.78it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.86it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.68it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.80it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.88it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.78it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.86it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.46it/s][A
- 75%|███���███████████▋     | 134/179 [00:49<00:16,  2.68it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|███████████████▉     | 136/179 [00:50<00:14,  2.87it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.70it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.79it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.86it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.44it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.68it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.44it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.46it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:10,  2.78it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|██████████████████   | 154/179 [00:56<00:09,  2.68it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.78it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:08,  2.86it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.68it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.79it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.88it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.48it/s][A
- 91%|███████████████████  | 162/179 [00:59<00:06,  2.71it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.81it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.89it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.79it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.87it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.79it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.69it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.62it/s][A                                                            
-                                                            [A{'eval_loss': 2.5978446006774902, 'eval_runtime': 68.7115, 'eval_samples_per_second': 2.853, 'eval_steps_per_second': 1.426, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.75, 'epoch': 0.05}
-  5%|▉                  | 50/1000 [43:11<2:02:08,  7.71s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.62it/s][A
-                                                            [A[2025-10-18 19:45:58,394] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-50
-  5%|▉                  | 51/1000 [43:21<8:12:20, 31.13s/it]                                                            {'loss': 2.6611, 'grad_norm': 0.7160661220550537, 'learning_rate': 0.00019979028262377118, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1114.08, 'epoch': 0.05}
-  5%|▉                  | 51/1000 [43:21<8:12:20, 31.13s/it]  5%|▉                  | 52/1000 [43:29<6:20:46, 24.10s/it]                                                            {'loss': 2.8051, 'grad_norm': 0.8069626092910767, 'learning_rate': 0.0001997687948796831, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 849.93, 'epoch': 0.05}
-  5%|▉                  | 52/1000 [43:29<6:20:46, 24.10s/it]  5%| | 53/1000 [43:37<5:02:36, 19.17s/it                                                            {'loss': 2.7554, 'grad_norm': 0.7512195706367493, 'learning_rate': 0.00019974626060814647, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 955.18, 'epoch': 0.05}
-  5%| | 53/1000 [43:37<5:02:36, 19.17s/it  5%| | 54/1000 [43:44<4:08:00, 15.73s/it                                         {'loss': 2.6504, 'grad_norm': 0.7488393783569336, 'learning_rate': 0.0001997226800455352, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 954.53, 'epoch': 0.05}
-  5%| | 54/1000 [43:44<4:08:00, 15.73s/it  6%| | 55/1000 [43:52<3:29:47, 13.32s/it                                         {'loss': 2.7668, 'grad_norm': 0.7308831214904785, 'learning_rate': 0.00019969805343919821, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1108.89, 'epoch': 0.06}
-  6%| | 55/1000 [43:52<3:29:47, 13.32s/it  6%| | 56/1000 [44:00<3:03:06, 11.64s/it                                         {'loss': 2.6415, 'grad_norm': 0.8551144003868103, 'learning_rate': 0.00019967238104745696, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 773.68, 'epoch': 0.06}
-  6%| | 56/1000 [44:00<3:03:06, 11.64s/it  6%| | 57/1000 [44:08<2:44:16, 10.45s/it                                         {'loss': 2.726, 'grad_norm': 0.811182975769043, 'learning_rate': 0.00019964566313960264, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 921.12, 'epoch': 0.06}
-  6%| | 57/1000 [44:08<2:44:16, 10.45s/it  6%| | 58/1000 [44:15<2:31:09,  9.63s/it                                         {'loss': 2.6357, 'grad_norm': 0.7899704575538635, 'learning_rate': 0.00019961789999589356, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 974.41, 'epoch': 0.06}
-  6%| | 58/1000 [44:15<2:31:09,  9.63s/it  6%| | 59/1000 [44:23<2:21:58,  9.05s/it                                         {'loss': 2.6244, 'grad_norm': 1.0924081802368164, 'learning_rate': 0.00019958909190755187, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 475.12, 'epoch': 0.06}
-  6%| | 59/1000 [44:23<2:21:58,  9.05s/it  6%| | 60/1000 [44:31<2:15:28,  8.65s/it                                         {'loss': 2.6244, 'grad_norm': 0.88262540102005, 'learning_rate': 0.0001995592391767608, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 690.56, 'epoch': 0.06}
-  6%| | 60/1000 [44:31<2:15:28,  8.65s/it  6%| | 61/1000 [44:38<2:10:58,  8.37s/it                                         {'loss': 2.6641, 'grad_norm': 0.734322190284729, 'learning_rate': 0.0001995283421166614, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1107.76, 'epoch': 0.06}
-  6%| | 61/1000 [44:38<2:10:58,  8.37s/it  6%| | 62/1000 [44:46<2:07:44,  8.17s/it                                         {'loss': 2.7376, 'grad_norm': 1.119425892829895, 'learning_rate': 0.00019949640105134918, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 420.2, 'epoch': 0.06}
-  6%| | 62/1000 [44:46<2:07:44,  8.17s/it  6%| | 63/1000 [44:54<2:05:17,  8.02s/it                                         {'loss': 2.5299, 'grad_norm': 0.938359260559082, 'learning_rate': 0.00019946341631587087, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 647.24, 'epoch': 0.06}
-  6%| | 63/1000 [44:54<2:05:17,  8.02s/it  6%| | 64/1000 [45:02<2:03:39,  7.93s/it                                         {'loss': 2.8222, 'grad_norm': 0.9427067637443542, 'learning_rate': 0.00019942938825622065, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 769.19, 'epoch': 0.06}
-  6%| | 64/1000 [45:02<2:03:39,  7.93s/it  6%| | 65/1000 [45:09<2:02:27,  7.86s/it                                         {'loss': 2.5493, 'grad_norm': 0.9020629525184631, 'learning_rate': 0.0001993943172293368, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 711.24, 'epoch': 0.07}
-  6%| | 65/1000 [45:09<2:02:27,  7.86s/it  7%| | 66/1000 [45:17<2:01:34,  7.81s/it                                         {'loss': 2.6981, 'grad_norm': 0.8587468266487122, 'learning_rate': 0.00019935820360309777, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 780.31, 'epoch': 0.07}
-  7%| | 66/1000 [45:17<2:01:34,  7.81s/it  7%| | 67/1000 [45:25<2:01:07,  7.79s/it                                         {'loss': 2.5964, 'grad_norm': 0.9086309671401978, 'learning_rate': 0.00019932104775631846, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 741.03, 'epoch': 0.07}
-  7%| | 67/1000 [45:25<2:01:07,  7.79s/it  7%| | 68/1000 [45:32<2:00:31,  7.76s/it                                         {'loss': 2.6434, 'grad_norm': 1.013075351715088, 'learning_rate': 0.0001992828500787461, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 642.95, 'epoch': 0.07}
-  7%| | 68/1000 [45:32<2:00:31,  7.76s/it  7%| | 69/1000 [45:40<2:00:03,  7.74s/it                                         {'loss': 2.4197, 'grad_norm': 0.8920902609825134, 'learning_rate': 0.00019924361097105623, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 672.71, 'epoch': 0.07}
-  7%| | 69/1000 [45:40<2:00:03,  7.74s/it  7%| | 70/1000 [45:48<1:59:42,  7.72s/it                                         {'loss': 2.541, 'grad_norm': 0.8855428099632263, 'learning_rate': 0.00019920333084484857, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 823.24, 'epoch': 0.07}
-  7%| | 70/1000 [45:48<1:59:42,  7.72s/it  7%| | 71/1000 [45:55<1:59:29,  7.72s/it                                         {'loss': 2.5695, 'grad_norm': 0.7850986123085022, 'learning_rate': 0.00019916201012264254, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 986.0, 'epoch': 0.07}
-  7%| | 71/1000 [45:55<1:59:29,  7.72s/it  7%| | 72/1000 [46:03<1:59:15,  7.71s/it                                         {'loss': 2.5996, 'grad_norm': 0.8797448873519897, 'learning_rate': 0.00019911964923787295, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 718.35, 'epoch': 0.07}
-  7%| | 72/1000 [46:03<1:59:15,  7.71s/it  7%| | 73/1000 [46:11<1:59:05,  7.71s/it                                         {'loss': 2.5697, 'grad_norm': 0.755560576915741, 'learning_rate': 0.0001990762486348855, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 909.79, 'epoch': 0.07}
-  7%| | 73/1000 [46:11<1:59:05,  7.71s/it  7%| | 74/1000 [46:19<1:58:58,  7.71s/it                                         {'loss': 2.4671, 'grad_norm': 0.7678999900817871, 'learning_rate': 0.00019903180876893194, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 933.01, 'epoch': 0.07}
-  7%| | 74/1000 [46:19<1:58:58,  7.71s/it  8%| | 75/1000 [46:26<1:58:49,  7.71s/it                                         {'loss': 2.3965, 'grad_norm': 0.7556151747703552, 'learning_rate': 0.00019898633010616542, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 893.52, 'epoch': 0.07}
-  8%| | 75/1000 [46:26<1:58:49,  7.71s/it  8%| | 76/1000 [46:34<1:58:39,  7.70s/it                                         {'loss': 2.6524, 'grad_norm': 0.7555602788925171, 'learning_rate': 0.00019893981312363562, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1094.2, 'epoch': 0.08}
-  8%| | 76/1000 [46:34<1:58:39,  7.70s/it  8%| | 77/1000 [46:42<1:58:34,  7.71s/it                                         {'loss': 2.4937, 'grad_norm': 0.7781569957733154, 'learning_rate': 0.00019889225830928365, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1158.65, 'epoch': 0.08}
-  8%| | 77/1000 [46:42<1:58:34,  7.71s/it  8%| | 78/1000 [46:49<1:58:17,  7.70s/it                                         {'loss': 2.6399, 'grad_norm': 0.9108109474182129, 'learning_rate': 0.00019884366616193706, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 718.86, 'epoch': 0.08}
-  8%| | 78/1000 [46:49<1:58:17,  7.70s/it  8%| | 79/1000 [46:57<1:58:12,  7.70s/it                                         {'loss': 2.6243, 'grad_norm': 1.0023678541183472, 'learning_rate': 0.0001987940371913044, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 532.64, 'epoch': 0.08}
-  8%| | 79/1000 [46:57<1:58:12,  7.70s/it  8%| | 80/1000 [47:05<1:58:01,  7.70s/it                                         {'loss': 2.5985, 'grad_norm': 0.7970336079597473, 'learning_rate': 0.0001987433719179702, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 945.6, 'epoch': 0.08}
-  8%| | 80/1000 [47:05<1:58:01,  7.70s/it  8%| | 81/1000 [47:12<1:57:55,  7.70s/it                                         {'loss': 2.7574, 'grad_norm': 0.8040705323219299, 'learning_rate': 0.00019869167087338907, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 985.28, 'epoch': 0.08}
-  8%| | 81/1000 [47:12<1:57:55,  7.70s/it  8%| | 82/1000 [47:20<1:57:47,  7.70s/it                                         {'loss': 2.5813, 'grad_norm': 0.8781325817108154, 'learning_rate': 0.00019863893459988062, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 722.58, 'epoch': 0.08}
-  8%| | 82/1000 [47:20<1:57:47,  7.70s/it  8%| | 83/1000 [47:28<1:57:41,  7.70s/it                                         {'loss': 2.782, 'grad_norm': 0.9233244061470032, 'learning_rate': 0.00019858516365062334, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 784.85, 'epoch': 0.08}
-  8%| | 83/1000 [47:28<1:57:41,  7.70s/it  8%| | 84/1000 [47:36<1:57:31,  7.70s/it                                         {'loss': 2.5425, 'grad_norm': 0.7857173085212708, 'learning_rate': 0.00019853035858964906, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 945.81, 'epoch': 0.08}
-  8%| | 84/1000 [47:36<1:57:31,  7.70s/it  8%| | 85/1000 [47:43<1:57:25,  7.70s/it                                         {'loss': 2.7255, 'grad_norm': 0.772782564163208, 'learning_rate': 0.00019847451999183694, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1003.59, 'epoch': 0.09}
-  8%| | 85/1000 [47:43<1:57:25,  7.70s/it  9%| | 86/1000 [47:51<1:57:20,  7.70s/it                                         {'loss': 2.6873, 'grad_norm': 0.8894524574279785, 'learning_rate': 0.00019841764844290744, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 780.4, 'epoch': 0.09}
-  9%| | 86/1000 [47:51<1:57:20,  7.70s/it  9%| | 87/1000 [47:59<1:57:04,  7.69s/it                                         {'loss': 2.665, 'grad_norm': 0.8725907802581787, 'learning_rate': 0.0001983597445394162, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 682.99, 'epoch': 0.09}
-  9%| | 87/1000 [47:59<1:57:04,  7.69s/it  9%| | 88/1000 [48:06<1:57:00,  7.70s/it                                         {'loss': 2.4541, 'grad_norm': 0.949134111404419, 'learning_rate': 0.00019830080888874778, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 702.63, 'epoch': 0.09}
-  9%| | 88/1000 [48:06<1:57:00,  7.70s/it  9%| | 89/1000 [48:14<1:56:55,  7.70s/it                                         {'loss': 2.7143, 'grad_norm': 0.8840274214744568, 'learning_rate': 0.00019824084210910925, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 814.21, 'epoch': 0.09}
-  9%| | 89/1000 [48:14<1:56:55,  7.70s/it  9%| | 90/1000 [48:22<1:56:48,  7.70s/it                                         {'loss': 2.6434, 'grad_norm': 0.780836284160614, 'learning_rate': 0.00019817984482952376, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 965.92, 'epoch': 0.09}
-  9%| | 90/1000 [48:22<1:56:48,  7.70s/it  9%| | 91/1000 [48:29<1:56:48,  7.71s/it                                         {'loss': 2.6004, 'grad_norm': 0.7321323156356812, 'learning_rate': 0.0001981178176898239, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1083.02, 'epoch': 0.09}
-  9%| | 91/1000 [48:29<1:56:48,  7.71s/it  9%| | 92/1000 [48:37<1:56:35,  7.70s/it                                         {'loss': 2.5644, 'grad_norm': 0.7658954858779907, 'learning_rate': 0.00019805476134064507, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1171.59, 'epoch': 0.09}
-  9%| | 92/1000 [48:37<1:56:35,  7.70s/it  9%| | 93/1000 [48:45<1:56:28,  7.71s/it                                         {'loss': 2.5268, 'grad_norm': 0.7415146231651306, 'learning_rate': 0.00019799067644341844, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1053.93, 'epoch': 0.09}
-  9%| | 93/1000 [48:45<1:56:28,  7.71s/it  9%| | 94/1000 [48:53<1:56:15,  7.70s/it                                         {'loss': 2.3965, 'grad_norm': 0.7425378561019897, 'learning_rate': 0.00019792556367036432, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1014.15, 'epoch': 0.09}
-  9%| | 94/1000 [48:53<1:56:15,  7.70s/it 10%| | 95/1000 [49:00<1:56:08,  7.70s/it                                         {'loss': 2.6218, 'grad_norm': 0.7970593571662903, 'learning_rate': 0.0001978594237044849, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1033.25, 'epoch': 0.1}
- 10%| | 95/1000 [49:00<1:56:08,  7.70s/it 10%| | 96/1000 [49:08<1:56:02,  7.70s/it                                         {'loss': 2.5781, 'grad_norm': 0.761059582233429, 'learning_rate': 0.00019779225723955707, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1032.59, 'epoch': 0.1}
- 10%| | 96/1000 [49:08<1:56:02,  7.70s/it 10%| | 97/1000 [49:16<1:55:48,  7.69s/it                                         {'loss': 2.4825, 'grad_norm': 0.9234409928321838, 'learning_rate': 0.0001977240649801253, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 625.25, 'epoch': 0.1}
- 10%| | 97/1000 [49:16<1:55:48,  7.69s/it 10%| | 98/1000 [49:23<1:55:43,  7.70s/it                                         {'loss': 2.6297, 'grad_norm': 0.9856524467468262, 'learning_rate': 0.00019765484764149415, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 573.75, 'epoch': 0.1}
- 10%| | 98/1000 [49:23<1:55:43,  7.70s/it 10%| | 99/1000 [49:31<1:55:48,  7.71s/it                                         {'loss': 2.7264, 'grad_norm': 0.764410674571991, 'learning_rate': 0.00019758460594972068, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1097.38, 'epoch': 0.1}
- 10%| | 99/1000 [49:31<1:55:48,  7.71s/it 10%| | 100/1000 [49:39<1:55:30,  7.70s/i                                         {'loss': 2.3997, 'grad_norm': 0.8525063991546631, 'learning_rate': 0.00019751334064160706, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 744.01, 'epoch': 0.1}
- 10%| | 100/1000 [49:39<1:55:30,  7.70s/i[2025-10-18 19:52:25,856] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 19:52:28,836] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4034972190856934
-[2025-10-18 19:52:30,257] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4211218357086182
-[2025-10-18 19:52:31,652] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3944928646087646
-[2025-10-18 19:52:33,103] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4504680633544922
-[2025-10-18 19:52:33,103] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|            | 0/179 [00:00<?, ?it/s][A
-  1%|    | 2/179 [00:00<00:29,  6.05it/s][A
-  2%|    | 3/179 [00:00<00:40,  4.35it/s][A
-  2%|    | 4/179 [00:00<00:46,  3.77it/s][A
-  3%|    | 5/179 [00:01<01:24,  2.06it/s][A
-  3%|▏   | 6/179 [00:02<01:11,  2.42it/s][A
-  4%|▏   | 7/179 [00:02<01:06,  2.60it/s][A
-  4%|▏   | 8/179 [00:02<01:02,  2.73it/s][A
-  5%|▏   | 9/179 [00:03<01:11,  2.39it/s][A
-  6%|▏  | 10/179 [00:03<01:04,  2.62it/s][A
-  6%|▏  | 11/179 [00:03<01:01,  2.73it/s][A
-  7%|▏  | 12/179 [00:04<00:59,  2.82it/s][A
-  7%|▏  | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|▏  | 14/179 [00:05<01:01,  2.67it/s][A
-  8%|▎  | 15/179 [00:05<00:59,  2.78it/s][A
-  9%|▎  | 16/179 [00:05<00:57,  2.86it/s][A
-  9%|▎  | 17/179 [00:06<01:06,  2.45it/s][A
- 10%|▎  | 18/179 [00:06<01:00,  2.68it/s][A
- 11%|▎  | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|▎  | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|▎  | 21/179 [00:07<01:04,  2.45it/s][A
- 12%|▎  | 22/179 [00:08<00:58,  2.69it/s][A
- 13%|▍  | 23/179 [00:08<00:55,  2.79it/s][A
- 13%|▍  | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|▍  | 25/179 [00:09<01:02,  2.46it/s][A
- 15%|▍  | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|▍  | 27/179 [00:09<00:54,  2.78it/s][A
- 16%|▍  | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|▍  | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|▌  | 30/179 [00:11<00:55,  2.70it/s][A
- 17%|▌  | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|▌  | 32/179 [00:11<00:51,  2.87it/s][A
- 18%|▌  | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|▌  | 34/179 [00:12<00:54,  2.68it/s][A
- 20%|▌  | 35/179 [00:12<00:51,  2.79it/s][A
- 20%|▌  | 36/179 [00:13<00:49,  2.87it/s][A
- 21%|▌  | 37/179 [00:13<00:57,  2.48it/s][A
- 21%|▋  | 38/179 [00:14<00:52,  2.70it/s][A
- 22%|▋  | 39/179 [00:14<00:49,  2.80it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.88it/s][A
- 23%|█████                 | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.71it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.81it/s][A
- 25%|█████▍                | 44/179 [00:16<00:46,  2.88it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.71it/s][A
- 26%|█████▊                | 47/179 [00:17<00:46,  2.81it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.89it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.48it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.70it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.80it/s][A
- 29%|██████▍               | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|██████▌               | 53/179 [00:19<00:50,  2.48it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.70it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.81it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.88it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|███████▏              | 58/179 [00:21<00:44,  2.69it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.80it/s][A
- 34%|███████▎              | 60/179 [00:22<00:41,  2.87it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.70it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.80it/s][A
- 36%|███████▊              | 64/179 [00:23<00:39,  2.89it/s][A
- 36%|███████▉              | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|████████              | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|████████▏             | 67/179 [00:24<00:40,  2.80it/s][A
- 38%|████████▎             | 68/179 [00:25<00:38,  2.88it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.79it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|████████▉             | 73/179 [00:27<00:42,  2.47it/s][A
- 41%|█████████             | 74/179 [00:27<00:38,  2.69it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.79it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|���████████▉            | 81/179 [00:30<00:39,  2.47it/s][A
- 46%|██████████            | 82/179 [00:30<00:36,  2.69it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:38,  2.46it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:32,  2.79it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:31,  2.87it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|███████████           | 90/179 [00:33<00:32,  2.70it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.80it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.87it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:34,  2.47it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.69it/s][A
- 53%|███████████▋          | 95/179 [00:35<00:30,  2.79it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:29,  2.85it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.46it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.67it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.78it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.83it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.45it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.68it/s][A
- 58%|████████████         | 103/179 [00:38<00:27,  2.79it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.87it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|████████████▌        | 107/179 [00:39<00:25,  2.77it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.85it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:28,  2.45it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.67it/s][A
- 62%|█████████████        | 111/179 [00:41<00:24,  2.78it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.84it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.68it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:23,  2.78it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:22,  2.85it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|█████████████▉       | 119/179 [00:44<00:21,  2.79it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.46it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.70it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:19,  2.81it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.88it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:21,  2.48it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.70it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.81it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.88it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.48it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.70it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.81it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|███████████████▋     | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|████████████���██▉     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.44it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.67it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.77it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.85it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.46it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.86it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.45it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.68it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.45it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:10,  2.78it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.46it/s][A
- 86%|██████████████████   | 154/179 [00:57<00:09,  2.69it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.79it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:08,  2.87it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.45it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.89it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|███████████████████  | 162/179 [01:00<00:06,  2.69it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.88it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.68it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.71it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.79it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.87it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.87it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.69it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.50it/s][A                                         
-                                                            [A{'eval_loss': 2.5591788291931152, 'eval_runtime': 68.8733, 'eval_samples_per_second': 2.846, 'eval_steps_per_second': 1.423, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.1}
- 10%|█▊                | 100/1000 [50:55<1:55:30,  7.70s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.50it/s][A
-                                                            [A[2025-10-18 19:53:41,985] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-100
- 10%|█▊                | 101/1000 [51:05<7:48:36, 31.28s/it]                                                            {'loss': 2.4983, 'grad_norm': 0.9012774229049683, 'learning_rate': 0.00019744105246469263, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 660.6, 'epoch': 0.1}
- 10%|█▊                | 101/1000 [51:05<7:48:36, 31.28s/it] 10%|█▊                | 102/1000 [51:13<6:02:08, 24.20s/it]                                                            {'loss': 2.7862, 'grad_norm': 0.8785008192062378, 'learning_rate': 0.00019736774217724614, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 992.65, 'epoch': 0.1}
- 10%|█▊                | 102/1000 [51:13<6:02:08, 24.20s/it] 10%|█▊                | 103/1000 [51:20<4:47:34, 19.24s/it]                                                            {'loss': 2.6536, 'grad_norm': 0.9388639330863953, 'learning_rate': 0.00019729341054825782, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 790.17, 'epoch': 0.1}
- 10%|█▊                | 103/1000 [51:20<4:47:34, 19.24s/it] 10%|█▊                | 104/1000 [51:28<3:55:29, 15.77s/it]                                                            {'loss': 2.6323, 'grad_norm': 1.2848340272903442, 'learning_rate': 0.00019721805835743134, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 498.85, 'epoch': 0.1}
- 10%|█▊                | 104/1000 [51:28<3:55:29, 15.77s/it] 10%|█▉                | 105/1000 [51:36<3:18:57, 13.34s/it]                                                            {'loss': 2.9617, 'grad_norm': 0.82762211561203, 'learning_rate': 0.00019714168639517544, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 802.9, 'epoch': 0.1}
- 10%|█▉                | 105/1000 [51:36<3:18:57, 13.34s/it] 11%|█▉                | 106/1000 [51:43<2:53:30, 11.65s/it]                                                            {'loss': 2.7178, 'grad_norm': 0.8824675679206848, 'learning_rate': 0.00019706429546259593, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 711.32, 'epoch': 0.11}
- 11%|█▉                | 106/1000 [51:43<2:53:30, 11.65s/it] 11%|█▉                | 107/1000 [51:51<2:35:44, 10.46s/it]                                                            {'loss': 2.6827, 'grad_norm': 0.9259470105171204, 'learning_rate': 0.00019698588637148703, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 763.68, 'epoch': 0.11}
- 11%|█▉                | 107/1000 [51:51<2:35:44, 10.46s/it] 11%|█▉                | 108/1000 [51:59<2:23:15,  9.64s/it]                                                            {'loss': 2.763, 'grad_norm': 0.8570597171783447, 'learning_rate': 0.00019690645994432305, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 833.06, 'epoch': 0.11}
- 11%|█▉                | 108/1000 [51:59<2:23:15,  9.64s/it] 11%|█▉                | 109/1000 [52:07<2:14:25,  9.05s/it]                                                            {'loss': 2.5746, 'grad_norm': 0.8325176239013672, 'learning_rate': 0.0001968260170142496, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 773.01, 'epoch': 0.11}
- 11%|█▉                | 109/1000 [52:07<2:14:25,  9.05s/it] 11%|█▉                | 110/1000 [52:14<2:08:21,  8.65s/it]                                                            {'loss': 2.4699, 'grad_norm': 0.7705450057983398, 'learning_rate': 0.00019674455842507492, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 990.25, 'epoch': 0.11}
- 11%|█▉                | 110/1000 [52:14<2:08:21,  8.65s/it] 11%|█▉                | 111/1000 [52:22<2:04:04,  8.37s/it]                                                            {'loss': 2.435, 'grad_norm': 0.7997984290122986, 'learning_rate': 0.00019666208503126112, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 948.62, 'epoch': 0.11}
- 11%|█▉                | 111/1000 [52:22<2:04:04,  8.37s/it] 11%|██                | 112/1000 [52:30<2:01:00,  8.18s/it]                                                            {'loss': 2.6524, 'grad_norm': 0.833321750164032, 'learning_rate': 0.00019657859769791505, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1058.14, 'epoch': 0.11}
- 11%|██                | 112/1000 [52:30<2:01:00,  8.18s/it] 11%|██                | 113/1000 [52:37<1:58:45,  8.03s/it]                                                            {'loss': 2.6103, 'grad_norm': 0.8038913011550903, 'learning_rate': 0.00019649409730077935, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 956.43, 'epoch': 0.11}
- 11%|██                | 113/1000 [52:37<1:58:45,  8.03s/it] 11%|██                | 114/1000 [52:45<1:57:05,  7.93s/it]                                                            {'loss': 2.7383, 'grad_norm': 0.8915939927101135, 'learning_rate': 0.00019640858472622316, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 751.35, 'epoch': 0.11}
- 11%|██                | 114/1000 [52:45<1:57:05,  7.93s/it] 12%|██                | 115/1000 [52:53<1:55:58,  7.86s/it]                                                            {'loss': 2.6234, 'grad_norm': 0.7561622858047485, 'learning_rate': 0.00019632206087123296, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1035.59, 'epoch': 0.12}
- 12%|██                | 115/1000 [52:53<1:55:58,  7.86s/it] 12%|██                | 116/1000 [53:00<1:55:07,  7.81s/it]                                                            {'loss': 2.4453, 'grad_norm': 0.8143954873085022, 'learning_rate': 0.00019623452664340306, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 851.92, 'epoch': 0.12}
- 12%|██                | 116/1000 [53:00<1:55:07,  7.81s/it] 12%|██                | 117/1000 [53:08<1:54:26,  7.78s/it]                                                            {'loss': 2.6071, 'grad_norm': 0.8241326212882996, 'learning_rate': 0.000196145982960926, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 841.33, 'epoch': 0.12}
- 12%|██                | 117/1000 [53:08<1:54:26,  7.78s/it] 12%|██                | 118/1000 [53:16<1:54:03,  7.76s/it]                                                            {'loss': 2.4851, 'grad_norm': 0.7440032958984375, 'learning_rate': 0.00019605643075258321, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1081.12, 'epoch': 0.12}
- 12%|██                | 118/1000 [53:16<1:54:03,  7.76s/it] 12%|██▏               | 119/1000 [53:24<1:53:42,  7.74s/it]                                                            {'loss': 2.663, 'grad_norm': 0.8540291786193848, 'learning_rate': 0.00019596587095773495, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 952.89, 'epoch': 0.12}
- 12%|██▏               | 119/1000 [53:24<1:53:42,  7.74s/it] 12%|██▏               | 120/1000 [53:31<1:53:29,  7.74s/it]                                                            {'loss': 2.5209, 'grad_norm': 0.8957391381263733, 'learning_rate': 0.0001958743045263106, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 895.83, 'epoch': 0.12}
- 12%|██▏               | 120/1000 [53:31<1:53:29,  7.74s/it] 12%|██▏               | 121/1000 [53:39<1:53:10,  7.73s/it]                                                            {'loss': 2.7068, 'grad_norm': 1.000033974647522, 'learning_rate': 0.00019578173241879872, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 634.82, 'epoch': 0.12}
- 12%|██▏               | 121/1000 [53:39<1:53:10,  7.73s/it] 12%|██▏               | 122/1000 [53:47<1:52:57,  7.72s/it]                                                            {'loss': 2.4959, 'grad_norm': 0.7940590381622314, 'learning_rate': 0.0001956881556062369, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 920.94, 'epoch': 0.12}
- 12%|██▏               | 122/1000 [53:47<1:52:57,  7.72s/it] 12%|██▏               | 123/1000 [53:54<1:52:44,  7.71s/it]                                                            {'loss': 2.6286, 'grad_norm': 1.022066593170166, 'learning_rate': 0.00019559357507020162, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 507.92, 'epoch': 0.12}
- 12%|██▏               | 123/1000 [53:54<1:52:44,  7.71s/it] 12%|██▏               | 124/1000 [54:02<1:52:41,  7.72s/it]                                                            {'loss': 2.5798, 'grad_norm': 0.6904109716415405, 'learning_rate': 0.00019549799180279792, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1169.21, 'epoch': 0.12}
- 12%|██▏               | 124/1000 [54:02<1:52:41,  7.72s/it] 12%|██▎               | 125/1000 [54:10<1:52:25,  7.71s/it]                                                            {'loss': 2.6275, 'grad_norm': 0.7310277819633484, 'learning_rate': 0.00019540140680664913, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 947.5, 'epoch': 0.12}
- 12%|██▎               | 125/1000 [54:10<1:52:25,  7.71s/it] 13%|██▎               | 126/1000 [54:17<1:52:11,  7.70s/it]                                                            {'loss': 2.7891, 'grad_norm': 0.8295698761940002, 'learning_rate': 0.0001953038210948861, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 917.76, 'epoch': 0.13}
- 13%|██▎               | 126/1000 [54:18<1:52:11,  7.70s/it] 13%|██▎               | 127/1000 [54:25<1:51:58,  7.70s/it]                                                            {'loss': 2.5553, 'grad_norm': 0.7668982744216919, 'learning_rate': 0.00019520523569113677, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 937.2, 'epoch': 0.13}
- 13%|██▎               | 127/1000 [54:25<1:51:58,  7.70s/it] 13%|██▎               | 128/1000 [54:33<1:51:56,  7.70s/it]                                                            {'loss': 2.7057, 'grad_norm': 0.8241111636161804, 'learning_rate': 0.00019510565162951537, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 905.25, 'epoch': 0.13}
- 13%|██▎               | 128/1000 [54:33<1:51:56,  7.70s/it] 13%|██▎               | 129/1000 [54:41<1:51:50,  7.70s/it]                                                            {'loss': 2.6795, 'grad_norm': 0.7360131144523621, 'learning_rate': 0.0001950050699546116, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1036.68, 'epoch': 0.13}
- 13%|██▎               | 129/1000 [54:41<1:51:50,  7.70s/it] 13%|██▎               | 130/1000 [54:48<1:51:38,  7.70s/it]                                                            {'loss': 2.4604, 'grad_norm': 0.7902219891548157, 'learning_rate': 0.00019490349172147963, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 774.7, 'epoch': 0.13}
- 13%|██▎               | 130/1000 [54:48<1:51:38,  7.70s/it] 13%|██▎               | 131/1000 [54:56<1:51:38,  7.71s/it]                                                            {'loss': 2.5411, 'grad_norm': 0.6898303031921387, 'learning_rate': 0.00019480091799562704, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1166.57, 'epoch': 0.13}
- 13%|██▎               | 131/1000 [54:56<1:51:38,  7.71s/it] 13%|██▍               | 132/1000 [55:04<1:51:31,  7.71s/it]                                                            {'loss': 2.4906, 'grad_norm': 0.8677055835723877, 'learning_rate': 0.00019469734985300371, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 606.4, 'epoch': 0.13}
- 13%|██▍               | 132/1000 [55:04<1:51:31,  7.71s/it] 13%|██▍               | 133/1000 [55:11<1:51:17,  7.70s/it]                                                            {'loss': 2.686, 'grad_norm': 0.8123106956481934, 'learning_rate': 0.00019459278837999046, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 802.26, 'epoch': 0.13}
- 13%|██▍               | 133/1000 [55:11<1:51:17,  7.70s/it] 13%|██▍               | 134/1000 [55:19<1:51:07,  7.70s/it]                                                            {'loss': 2.514, 'grad_norm': 0.758438229560852, 'learning_rate': 0.00019448723467338763, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 963.92, 'epoch': 0.13}
- 13%|██▍               | 134/1000 [55:19<1:51:07,  7.70s/it] 14%|██▍               | 135/1000 [55:27<1:51:05,  7.71s/it]                                                            {'loss': 2.7229, 'grad_norm': 0.919758141040802, 'learning_rate': 0.00019438068984040365, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 730.88, 'epoch': 0.14}
- 14%|██▍               | 135/1000 [55:27<1:51:05,  7.71s/it] 14%|██▍               | 136/1000 [55:35<1:50:51,  7.70s/it]                                                            {'loss': 2.5956, 'grad_norm': 0.863878607749939, 'learning_rate': 0.00019427315499864344, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 815.98, 'epoch': 0.14}
- 14%|██▍               | 136/1000 [55:35<1:50:51,  7.70s/it] 14%|██▍               | 137/1000 [55:42<1:50:43,  7.70s/it]                                                            {'loss': 2.5635, 'grad_norm': 0.6907688975334167, 'learning_rate': 0.00019416463127609656, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1254.8, 'epoch': 0.14}
- 14%|██▍               | 137/1000 [55:42<1:50:43,  7.70s/it] 14%|██▍               | 138/1000 [55:50<1:50:41,  7.70s/it]                                                            {'loss': 2.5411, 'grad_norm': 0.8854458928108215, 'learning_rate': 0.0001940551198111255, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 799.29, 'epoch': 0.14}
- 14%|██▍               | 138/1000 [55:50<1:50:41,  7.70s/it] 14%|██▌               | 139/1000 [55:58<1:50:29,  7.70s/it]                                                            {'loss': 2.5859, 'grad_norm': 0.8531408309936523, 'learning_rate': 0.00019394462175245381, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1007.05, 'epoch': 0.14}
- 14%|██▌               | 139/1000 [55:58<1:50:29,  7.70s/it] 14%|██▌               | 140/1000 [56:05<1:50:16,  7.69s/it]                                                            {'loss': 2.6093, 'grad_norm': 0.73235023021698, 'learning_rate': 0.0001938331382591537, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1100.41, 'epoch': 0.14}
- 14%|██▌               | 140/1000 [56:05<1:50:16,  7.69s/it] 14%|██▌               | 141/1000 [56:13<1:50:11,  7.70s/it]                                                            {'loss': 2.6661, 'grad_norm': 0.8245342969894409, 'learning_rate': 0.00019372067050063438, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 828.57, 'epoch': 0.14}
- 14%|██▌               | 141/1000 [56:13<1:50:11,  7.70s/it] 14%|██▌               | 142/1000 [56:21<1:49:58,  7.69s/it]                                                            {'loss': 2.4158, 'grad_norm': 1.1080478429794312, 'learning_rate': 0.00019360721965662933, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 377.01, 'epoch': 0.14}
- 14%|██▌               | 142/1000 [56:21<1:49:58,  7.69s/it] 14%|██▌               | 143/1000 [56:28<1:49:57,  7.70s/it]                                                            {'loss': 2.4901, 'grad_norm': 0.8388444781303406, 'learning_rate': 0.00019349278691718427, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 777.18, 'epoch': 0.14}
- 14%|██▌               | 143/1000 [56:28<1:49:57,  7.70s/it] 14%|██▌               | 144/1000 [56:36<1:49:51,  7.70s/it]                                                            {'loss': 2.7123, 'grad_norm': 0.8535293936729431, 'learning_rate': 0.00019337737348264447, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 780.83, 'epoch': 0.14}
- 14%|██▌               | 144/1000 [56:36<1:49:51,  7.70s/it] 14%|██▌               | 145/1000 [56:44<1:49:41,  7.70s/it]                                                            {'loss': 2.6937, 'grad_norm': 1.0128860473632812, 'learning_rate': 0.00019326098056364222, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 568.16, 'epoch': 0.14}
- 14%|██▌               | 145/1000 [56:44<1:49:41,  7.70s/it] 15%|██▋               | 146/1000 [56:51<1:49:32,  7.70s/it]                                                            {'loss': 2.7344, 'grad_norm': 0.7840571403503418, 'learning_rate': 0.00019314360938108425, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1089.71, 'epoch': 0.15}
- 15%|██▋               | 146/1000 [56:51<1:49:32,  7.70s/it] 15%|██▋               | 147/1000 [56:59<1:49:23,  7.69s/it]                                                            {'loss': 2.4024, 'grad_norm': 0.7362533211708069, 'learning_rate': 0.00019302526116613864, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 980.14, 'epoch': 0.15}
- 15%|██▋               | 147/1000 [56:59<1:49:23,  7.69s/it] 15%|██▋               | 148/1000 [57:07<1:49:22,  7.70s/it]                                                            {'loss': 2.6667, 'grad_norm': 0.7476270198822021, 'learning_rate': 0.00019290593716022217, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1124.12, 'epoch': 0.15}
- 15%|██▋               | 148/1000 [57:07<1:49:22,  7.70s/it] 15%|██▋               | 149/1000 [57:15<1:49:14,  7.70s/it]                                                            {'loss': 2.4112, 'grad_norm': 0.850767970085144, 'learning_rate': 0.00019278563861498723, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 695.97, 'epoch': 0.15}
- 15%|██▋               | 149/1000 [57:15<1:49:14,  7.70s/it] 15%|██▋               | 150/1000 [57:22<1:49:03,  7.70s/it]                                                            {'loss': 2.6275, 'grad_norm': 0.9258683323860168, 'learning_rate': 0.00019266436679230865, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 653.12, 'epoch': 0.15}
- 15%|██▋               | 150/1000 [57:22<1:49:03,  7.70s/it][2025-10-18 20:00:09,408] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 20:00:12,311] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3944275379180908
-[2025-10-18 20:00:13,741] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4293632507324219
-[2025-10-18 20:00:15,114] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3733999729156494
-[2025-10-18 20:00:16,532] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.417267084121704
-[2025-10-18 20:00:16,532] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:28,  6.11it/s][A
-  2%|▍                      | 3/179 [00:00<00:40,  4.32it/s][A
-  2%|▌                      | 4/179 [00:00<00:46,  3.77it/s][A
-  3%|▋                      | 5/179 [00:01<01:18,  2.22it/s][A
-  3%|▊                      | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|▉                      | 7/179 [00:02<01:04,  2.65it/s][A
-  4%|█                      | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▏                     | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▏                    | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▎                    | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▍                    | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▌                    | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▋                    | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.78it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.86it/s][A
-  9%|██                    | 17/179 [00:06<01:05,  2.47it/s][A
- 10%|██▏                   | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▎                   | 19/179 [00:06<00:57,  2.80it/s][A
- 11%|██▍                   | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▌                   | 21/179 [00:07<01:03,  2.47it/s][A
- 12%|██▋                   | 22/179 [00:07<00:58,  2.69it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|██▉                   | 24/179 [00:08<00:53,  2.87it/s][A
- 14%|███                   | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.70it/s][A
- 15%|███▎                  | 27/179 [00:09<00:54,  2.81it/s][A
- 16%|███▍                  | 28/179 [00:10<00:52,  2.89it/s][A
- 16%|███▌                  | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|███▋                  | 30/179 [00:10<00:55,  2.70it/s][A
- 17%|███▊                  | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|███▉                  | 32/179 [00:11<00:51,  2.88it/s][A
- 18%|████                  | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▏                 | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▎                 | 35/179 [00:12<00:51,  2.81it/s][A
- 20%|████▍                 | 36/179 [00:13<00:49,  2.86it/s][A
- 21%|████▌                 | 37/179 [00:13<00:57,  2.46it/s][A
- 21%|████▋                 | 38/179 [00:13<00:52,  2.67it/s][A
- 22%|████▊                 | 39/179 [00:14<00:50,  2.79it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.87it/s][A
- 23%|█████                 | 41/179 [00:15<00:55,  2.47it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.80it/s][A
- 25%|█████▍                | 44/179 [00:16<00:46,  2.89it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.48it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.71it/s][A
- 26%|█████▊                | 47/179 [00:17<00:46,  2.81it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.89it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.48it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.71it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.81it/s][A
- 29%|██████▍               | 52/179 [00:19<00:43,  2.89it/s][A
- 30%|██████▌               | 53/179 [00:19<00:50,  2.48it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.70it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.80it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|███████▏              | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.79it/s][A
- 34%|███████▎              | 60/179 [00:21<00:41,  2.86it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.69it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.80it/s][A
- 36%|███████▊              | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|███████▉              | 65/179 [00:23<00:46,  2.47it/s][A
- 37%|████████              | 66/179 [00:24<00:42,  2.68it/s][A
- 37%|████████▏             | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|████████▎             | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.88it/s][A
- 41%|████████▉             | 73/179 [00:26<00:43,  2.46it/s][A
- 41%|█████████             | 74/179 [00:27<00:39,  2.69it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:42,  2.42it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.67it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.78it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.47it/s][A
- 46%|██████████            | 82/179 [00:30<00:36,  2.68it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.86it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:38,  2.46it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.68it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:33,  2.77it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:31,  2.86it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:36,  2.46it/s][A
- 50%|███████████           | 90/179 [00:33<00:33,  2.68it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.78it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.86it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:34,  2.48it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.70it/s][A
- 53%|███████████▋          | 95/179 [00:35<00:29,  2.81it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.70it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.80it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.70it/s][A
- 58%|████████████         | 103/179 [00:37<00:27,  2.80it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.88it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:29,  2.47it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|████████████▌        | 107/179 [00:39<00:25,  2.78it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.86it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.69it/s][A
- 62%|█████████████        | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:22,  2.79it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|█████████████▉       | 119/179 [00:43<00:21,  2.78it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.46it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.79it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.87it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.46it/s][A
- 75%|███████████████▋     | 134/179 [00:49<00:16,  2.68it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.78it/s][A
- 76%|███████████████▉     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.67it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.78it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.86it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.67it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.77it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.46it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.68it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.70it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.88it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|██████████████████   | 154/179 [00:56<00:09,  2.69it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.80it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:08,  2.87it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.46it/s][A
- 91%|███████████████████  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.87it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.86it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.68it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.79it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.86it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.68it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.87it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.70it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.57it/s][A                                                            
-                                                            [A{'eval_loss': 2.5409650802612305, 'eval_runtime': 68.5506, 'eval_samples_per_second': 2.859, 'eval_steps_per_second': 1.43, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.15}
- 15%|██▋               | 150/1000 [58:38<1:49:03,  7.70s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.57it/s][A
-                                                            [A[2025-10-18 20:01:25,090] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-150
- 15%|██▋               | 151/1000 [58:49<7:23:02, 31.31s/it]                                                            {'loss': 2.7898, 'grad_norm': 0.7623653411865234, 'learning_rate': 0.00019254212296427044, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1116.72, 'epoch': 0.15}
- 15%|██▋               | 151/1000 [58:49<7:23:02, 31.31s/it] 15%|██▋               | 152/1000 [58:56<5:42:20, 24.22s/it]                                                            {'loss': 2.5995, 'grad_norm': 0.8514937162399292, 'learning_rate': 0.00019241890841315248, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 800.53, 'epoch': 0.15}
- 15%|██▋               | 152/1000 [58:56<5:42:20, 24.22s/it] 15%|██▊               | 153/1000 [59:04<4:31:48, 19.25s/it]                                                            {'loss': 2.6328, 'grad_norm': 0.851947009563446, 'learning_rate': 0.0001922947244314172, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 727.44, 'epoch': 0.15}
- 15%|██▊               | 153/1000 [59:04<4:31:48, 19.25s/it] 15%|██▊               | 154/1000 [59:12<3:42:26, 15.78s/it]                                                            {'loss': 2.6028, 'grad_norm': 0.838660478591919, 'learning_rate': 0.0001921695723216957, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 771.94, 'epoch': 0.15}
- 15%|██▊               | 154/1000 [59:12<3:42:26, 15.78s/it] 16%|██▊               | 155/1000 [59:19<3:07:59, 13.35s/it]                                                            {'loss': 2.4803, 'grad_norm': 0.7901943922042847, 'learning_rate': 0.00019204345339677442, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 864.65, 'epoch': 0.15}
- 16%|██▊               | 155/1000 [59:19<3:07:59, 13.35s/it] 16%|██▊               | 156/1000 [59:27<2:43:37, 11.63s/it]                                                            {'loss': 2.5621, 'grad_norm': 0.8262662291526794, 'learning_rate': 0.00019191636897958122, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 820.04, 'epoch': 0.16}
- 16%|██▊               | 156/1000 [59:27<2:43:37, 11.63s/it] 16%|██▊               | 157/1000 [59:35<2:26:46, 10.45s/it]                                                            {'loss': 2.6357, 'grad_norm': 0.9234705567359924, 'learning_rate': 0.00019178832040317155, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 711.89, 'epoch': 0.16}
- 16%|██▊               | 157/1000 [59:35<2:26:46, 10.45s/it] 16%|██▊               | 158/1000 [59:42<2:15:03,  9.62s/it]                                                            {'loss': 2.7918, 'grad_norm': 0.8142087459564209, 'learning_rate': 0.0001916593090107143, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1011.92, 'epoch': 0.16}
- 16%|██▊               | 158/1000 [59:42<2:15:03,  9.62s/it] 16%|██▊               | 159/1000 [59:50<2:06:52,  9.05s/it]                                                            {'loss': 2.6481, 'grad_norm': 0.7625157237052917, 'learning_rate': 0.00019152933615547798, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1034.27, 'epoch': 0.16}
- 16%|██▊               | 159/1000 [59:50<2:06:52,  9.05s/it] 16%|██▉               | 160/1000 [59:58<2:00:59,  8.64s/it]                                                            {'loss': 2.6044, 'grad_norm': 0.7401595711708069, 'learning_rate': 0.0001913984032008163, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1009.57, 'epoch': 0.16}
- 16%|██▉               | 160/1000 [59:58<2:00:59,  8.64s/it] 16%|██▌             | 161/1000 [1:00:06<1:56:54,  8.36s/it]                                                            {'loss': 2.3912, 'grad_norm': 0.6604964733123779, 'learning_rate': 0.00019126651152015403, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1237.6, 'epoch': 0.16}
- 16%|██▌             | 161/1000 [1:00:06<1:56:54,  8.36s/it] 16%|██▌             | 162/1000 [1:00:13<1:53:58,  8.16s/it]                                                            {'loss': 2.7, 'grad_norm': 0.7156261801719666, 'learning_rate': 0.0001911336624969725, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1160.99, 'epoch': 0.16}
- 16%|██▌             | 162/1000 [1:00:13<1:53:58,  8.16s/it] 16%|██▌             | 163/1000 [1:00:21<1:51:53,  8.02s/it]                                                            {'loss': 2.4756, 'grad_norm': 0.8412548303604126, 'learning_rate': 0.00019099985752479506, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 863.21, 'epoch': 0.16}
- 16%|██▌             | 163/1000 [1:00:21<1:51:53,  8.02s/it] 16%|██▌             | 164/1000 [1:00:29<1:50:23,  7.92s/it]                                                            {'loss': 2.4952, 'grad_norm': 0.7661089897155762, 'learning_rate': 0.00019086509800717258, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 895.56, 'epoch': 0.16}
- 16%|██▌             | 164/1000 [1:00:29<1:50:23,  7.92s/it] 16%|██▋             | 165/1000 [1:00:36<1:49:20,  7.86s/it]                                                            {'loss': 2.5454, 'grad_norm': 0.7496563792228699, 'learning_rate': 0.00019072938535766865, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1089.37, 'epoch': 0.17}
- 16%|██▋             | 165/1000 [1:00:36<1:49:20,  7.86s/it] 17%|██▋             | 166/1000 [1:00:44<1:48:38,  7.82s/it]                                                            {'loss': 2.4388, 'grad_norm': 0.6932787895202637, 'learning_rate': 0.0001905927209998447, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1158.97, 'epoch': 0.17}
- 17%|██▋             | 166/1000 [1:00:44<1:48:38,  7.82s/it] 17%|██▋             | 167/1000 [1:00:52<1:47:59,  7.78s/it]                                                            {'loss': 2.4348, 'grad_norm': 1.028424620628357, 'learning_rate': 0.0001904551063672452, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 506.92, 'epoch': 0.17}
- 17%|██▋             | 167/1000 [1:00:52<1:47:59,  7.78s/it] 17%|██▋             | 168/1000 [1:00:59<1:47:35,  7.76s/it]                                                            {'loss': 2.6918, 'grad_norm': 0.8657194375991821, 'learning_rate': 0.00019031654290338254, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 901.55, 'epoch': 0.17}
- 17%|██▋             | 168/1000 [1:00:59<1:47:35,  7.76s/it] 17%|██▋             | 169/1000 [1:01:07<1:47:10,  7.74s/it]                                                            {'loss': 2.5457, 'grad_norm': 0.8360164165496826, 'learning_rate': 0.00019017703206172185, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 903.18, 'epoch': 0.17}
- 17%|██▋             | 169/1000 [1:01:07<1:47:10,  7.74s/it] 17%|██▋             | 170/1000 [1:01:15<1:46:57,  7.73s/it]                                                            {'loss': 2.3703, 'grad_norm': 0.7944968342781067, 'learning_rate': 0.0001900365753056659, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 876.26, 'epoch': 0.17}
- 17%|██▋             | 170/1000 [1:01:15<1:46:57,  7.73s/it] 17%|██▋             | 171/1000 [1:01:23<1:46:42,  7.72s/it]                                                            {'loss': 2.5479, 'grad_norm': 0.7865543961524963, 'learning_rate': 0.00018989517410853955, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 885.54, 'epoch': 0.17}
- 17%|██▋             | 171/1000 [1:01:23<1:46:42,  7.72s/it] 17%|██▊             | 172/1000 [1:01:30<1:46:26,  7.71s/it]                                                            {'loss': 2.3944, 'grad_norm': 0.854613184928894, 'learning_rate': 0.00018975282995357446, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 654.54, 'epoch': 0.17}
- 17%|██▊             | 172/1000 [1:01:30<1:46:26,  7.71s/it] 17%|██▊             | 173/1000 [1:01:38<1:46:15,  7.71s/it]                                                            {'loss': 2.1964, 'grad_norm': 0.7296493053436279, 'learning_rate': 0.00018960954433389345, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1150.62, 'epoch': 0.17}
- 17%|██▊             | 173/1000 [1:01:38<1:46:15,  7.71s/it] 17%|██▊             | 174/1000 [1:01:46<1:46:01,  7.70s/it]                                                            {'loss': 2.4938, 'grad_norm': 0.879060685634613, 'learning_rate': 0.00018946531875249493, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 714.54, 'epoch': 0.17}
- 17%|██▊             | 174/1000 [1:01:46<1:46:01,  7.70s/it] 18%|██▊             | 175/1000 [1:01:53<1:46:00,  7.71s/it]                                                            {'loss': 2.8463, 'grad_norm': 0.8984085321426392, 'learning_rate': 0.00018932015472223693, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 799.96, 'epoch': 0.17}
- 18%|██▊             | 175/1000 [1:01:53<1:46:00,  7.71s/it] 18%|██▊             | 176/1000 [1:02:01<1:45:49,  7.71s/it]                                                            {'loss': 2.5829, 'grad_norm': 0.822972297668457, 'learning_rate': 0.00018917405376582145, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 897.95, 'epoch': 0.18}
- 18%|██▊             | 176/1000 [1:02:01<1:45:49,  7.71s/it] 18%|██▊             | 177/1000 [1:02:09<1:45:35,  7.70s/it]                                                            {'loss': 2.4156, 'grad_norm': 1.1078928709030151, 'learning_rate': 0.0001890270174157784, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 443.89, 'epoch': 0.18}
- 18%|██▊             | 177/1000 [1:02:09<1:45:35,  7.70s/it] 18%|██▊             | 178/1000 [1:02:16<1:45:28,  7.70s/it]                                                            {'loss': 2.5659, 'grad_norm': 0.7888846397399902, 'learning_rate': 0.00018887904721444953, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 836.5, 'epoch': 0.18}
- 18%|██▊             | 178/1000 [1:02:16<1:45:28,  7.70s/it] 18%|██▊             | 179/1000 [1:02:24<1:45:23,  7.70s/it]                                                            {'loss': 2.7451, 'grad_norm': 0.775917112827301, 'learning_rate': 0.00018873014471397224, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 920.84, 'epoch': 0.18}
- 18%|██▊             | 179/1000 [1:02:24<1:45:23,  7.70s/it] 18%|██▉             | 180/1000 [1:02:32<1:45:07,  7.69s/it]                                                            {'loss': 2.5683, 'grad_norm': 0.9729416370391846, 'learning_rate': 0.00018858031147626325, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 762.15, 'epoch': 0.18}
- 18%|██▉             | 180/1000 [1:02:32<1:45:07,  7.69s/it] 18%|██▉             | 181/1000 [1:02:40<1:45:06,  7.70s/it]                                                            {'loss': 2.5246, 'grad_norm': 0.7239236235618591, 'learning_rate': 0.00018842954907300236, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1020.3, 'epoch': 0.18}
- 18%|██▉             | 181/1000 [1:02:40<1:45:06,  7.70s/it] 18%|██▉             | 182/1000 [1:02:47<1:44:58,  7.70s/it]                                                            {'loss': 2.6746, 'grad_norm': 0.7856016755104065, 'learning_rate': 0.00018827785908561584, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 972.92, 'epoch': 0.18}
- 18%|██▉             | 182/1000 [1:02:47<1:44:58,  7.70s/it] 18%|██▉             | 183/1000 [1:02:55<1:44:51,  7.70s/it]                                                            {'loss': 2.6147, 'grad_norm': 1.1748241186141968, 'learning_rate': 0.0001881252431052599, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 475.01, 'epoch': 0.18}
- 18%|██▉             | 183/1000 [1:02:55<1:44:51,  7.70s/it] 18%|██▉             | 184/1000 [1:03:03<1:44:38,  7.69s/it]                                                            {'loss': 2.7514, 'grad_norm': 0.7395161986351013, 'learning_rate': 0.00018797170273280388, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1117.47, 'epoch': 0.18}
- 18%|██▉             | 184/1000 [1:03:03<1:44:38,  7.69s/it] 18%|██▉             | 185/1000 [1:03:10<1:44:35,  7.70s/it]                                                            {'loss': 2.429, 'grad_norm': 0.7633015513420105, 'learning_rate': 0.00018781723957881372, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 949.19, 'epoch': 0.18}
- 18%|██▉             | 185/1000 [1:03:10<1:44:35,  7.70s/it] 19%|██▉             | 186/1000 [1:03:18<1:44:29,  7.70s/it]                                                            {'loss': 2.6407, 'grad_norm': 0.7387210130691528, 'learning_rate': 0.0001876618552635348, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1043.95, 'epoch': 0.19}
- 19%|██▉             | 186/1000 [1:03:18<1:44:29,  7.70s/it] 19%|██▉             | 187/1000 [1:03:26<1:44:17,  7.70s/it]                                                            {'loss': 2.3151, 'grad_norm': 0.7511385083198547, 'learning_rate': 0.000187505551416875, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 913.39, 'epoch': 0.19}
- 19%|██▉             | 187/1000 [1:03:26<1:44:17,  7.70s/it] 19%|███             | 188/1000 [1:03:33<1:44:03,  7.69s/it]                                                            {'loss': 2.5173, 'grad_norm': 0.733252227306366, 'learning_rate': 0.00018734832967838775, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 973.56, 'epoch': 0.19}
- 19%|███             | 188/1000 [1:03:33<1:44:03,  7.69s/it] 19%|███             | 189/1000 [1:03:41<1:44:00,  7.69s/it]                                                            {'loss': 2.4854, 'grad_norm': 0.74712073802948, 'learning_rate': 0.00018719019169725472, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 940.32, 'epoch': 0.19}
- 19%|███             | 189/1000 [1:03:41<1:44:00,  7.69s/it] 19%|███             | 190/1000 [1:03:49<1:43:43,  7.68s/it]                                                            {'loss': 2.3441, 'grad_norm': 0.8071753978729248, 'learning_rate': 0.00018703113913226847, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 897.76, 'epoch': 0.19}
- 19%|███             | 190/1000 [1:03:49<1:43:43,  7.68s/it] 19%|███             | 191/1000 [1:03:56<1:43:42,  7.69s/it]                                                            {'loss': 2.6181, 'grad_norm': 0.7988425493240356, 'learning_rate': 0.00018687117365181512, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 926.89, 'epoch': 0.19}
- 19%|███             | 191/1000 [1:03:56<1:43:42,  7.69s/it] 19%|███             | 192/1000 [1:04:04<1:43:38,  7.70s/it]                                                            {'loss': 2.4741, 'grad_norm': 0.8813150525093079, 'learning_rate': 0.0001867102969338569, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 737.99, 'epoch': 0.19}
- 19%|███             | 192/1000 [1:04:04<1:43:38,  7.70s/it] 19%|███             | 193/1000 [1:04:12<1:43:22,  7.69s/it]                                                            {'loss': 2.5879, 'grad_norm': 0.9948791861534119, 'learning_rate': 0.00018654851066591448, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 593.04, 'epoch': 0.19}
- 19%|███             | 193/1000 [1:04:12<1:43:22,  7.69s/it] 19%|███             | 194/1000 [1:04:20<1:43:17,  7.69s/it]                                                            {'loss': 2.4327, 'grad_norm': 0.9575774073600769, 'learning_rate': 0.0001863858165450492, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 613.43, 'epoch': 0.19}
- 19%|███             | 194/1000 [1:04:20<1:43:17,  7.69s/it] 20%|███             | 195/1000 [1:04:27<1:43:11,  7.69s/it]                                                            {'loss': 2.6289, 'grad_norm': 0.8826113939285278, 'learning_rate': 0.0001862222162778454, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 964.23, 'epoch': 0.2}
- 20%|███             | 195/1000 [1:04:27<1:43:11,  7.69s/it] 20%|███▏            | 196/1000 [1:04:35<1:43:05,  7.69s/it]                                                            {'loss': 2.633, 'grad_norm': 0.904585063457489, 'learning_rate': 0.00018605771158039253, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 799.07, 'epoch': 0.2}
- 20%|███▏            | 196/1000 [1:04:35<1:43:05,  7.69s/it] 20%|███▏            | 197/1000 [1:04:43<1:43:02,  7.70s/it]                                                            {'loss': 2.7991, 'grad_norm': 0.8817514181137085, 'learning_rate': 0.00018589230417826697, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 802.89, 'epoch': 0.2}
- 20%|███▏            | 197/1000 [1:04:43<1:43:02,  7.70s/it] 20%|███▏            | 198/1000 [1:04:50<1:42:53,  7.70s/it]                                                            {'loss': 2.7115, 'grad_norm': 0.9352730512619019, 'learning_rate': 0.00018572599580651415, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 807.57, 'epoch': 0.2}
- 20%|��██▏            | 198/1000 [1:04:50<1:42:53,  7.70s/it] 20%|███▏            | 199/1000 [1:04:58<1:42:46,  7.70s/it]                                                            {'loss': 2.5218, 'grad_norm': 0.9579911231994629, 'learning_rate': 0.00018555878820963013, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 642.77, 'epoch': 0.2}
- 20%|███▏            | 199/1000 [1:04:58<1:42:46,  7.70s/it] 20%|███▏            | 200/1000 [1:05:06<1:42:33,  7.69s/it]                                                            {'loss': 2.8182, 'grad_norm': 0.8023660182952881, 'learning_rate': 0.00018539068314154354, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 974.0, 'epoch': 0.2}
- 20%|███▏            | 200/1000 [1:05:06<1:42:33,  7.69s/it][2025-10-18 20:07:52,802] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 20:07:55,721] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4153876304626465
-[2025-10-18 20:07:57,101] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.379310131072998
-[2025-10-18 20:07:58,501] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4002583026885986
-[2025-10-18 20:07:59,943] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4417080879211426
-[2025-10-18 20:07:59,944] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:28,  6.12it/s][A
-  2%|▍                      | 3/179 [00:00<00:41,  4.29it/s][A
-  2%|▌                      | 4/179 [00:00<00:46,  3.74it/s][A
-  3%|▋                      | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                      | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|▉                      | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█                      | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▏                     | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▏                    | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▎                    | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▍                    | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▌                    | 13/179 [00:04<01:07,  2.46it/s][A
-  8%|█▋                    | 14/179 [00:05<01:01,  2.69it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.80it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██                    | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▏                   | 18/179 [00:06<01:00,  2.68it/s][A
- 11%|██▎                   | 19/179 [00:06<00:57,  2.78it/s][A
- 11%|██▍                   | 20/179 [00:07<00:55,  2.86it/s][A
- 12%|██▌                   | 21/179 [00:07<01:04,  2.46it/s][A
- 12%|██▋                   | 22/179 [00:08<00:58,  2.68it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.79it/s][A
- 13%|██▉                   | 24/179 [00:08<00:54,  2.86it/s][A
- 14%|███                   | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▎                  | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▍                  | 28/179 [00:10<00:52,  2.85it/s][A
- 16%|███▌                  | 29/179 [00:10<01:01,  2.45it/s][A
- 17%|███▋                  | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|███▊                  | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|███▉                  | 32/179 [00:11<00:51,  2.86it/s][A
- 18%|████                  | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▏                 | 34/179 [00:12<00:53,  2.69it/s][A
- 20%|████▎                 | 35/179 [00:12<00:51,  2.81it/s][A
- 20%|████▍                 | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|████▌                 | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|████▋                 | 38/179 [00:13<00:52,  2.70it/s][A
- 22%|████▊                 | 39/179 [00:14<00:49,  2.80it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.89it/s][A
- 23%|█████                 | 41/179 [00:15<00:55,  2.47it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.80it/s][A
- 25%|█████▍                | 44/179 [00:16<00:47,  2.87it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.48it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|█████▊                | 47/179 [00:17<00:47,  2.80it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.88it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.48it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.70it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.80it/s][A
- 29%|██████▍               | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|██████▌               | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.69it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|███████▏              | 58/179 [00:21<00:45,  2.69it/s][A
- 33%|███████▎              | 59/179 [00:21<00:43,  2.79it/s][A
- 34%|███████▎              | 60/179 [00:22<00:41,  2.86it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.46it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.80it/s][A
- 36%|███████▊              | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|███████▉              | 65/179 [00:24<00:46,  2.48it/s][A
- 37%|████████              | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|████████▏             | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|████████▎             | 68/179 [00:24<00:38,  2.86it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.88it/s][A
- 41%|████████▉             | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|█████████             | 74/179 [00:27<00:38,  2.69it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:41,  2.46it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.80it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.47it/s][A
- 46%|██████████            | 82/179 [00:30<00:35,  2.70it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:37,  2.48it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.70it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:32,  2.80it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:31,  2.88it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:36,  2.48it/s][A
- 50%|███████████           | 90/179 [00:33<00:32,  2.70it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.80it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.88it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:34,  2.48it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.70it/s][A
- 53%|███████████▋          | 95/179 [00:34<00:29,  2.80it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.70it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.78it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.85it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|████████████         | 103/179 [00:37<00:27,  2.80it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.87it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.70it/s][A
- 60%|███████���████▌        | 107/179 [00:39<00:25,  2.80it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.87it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.70it/s][A
- 62%|█████████████        | 111/179 [00:40<00:24,  2.81it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.88it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.48it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:22,  2.81it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:21,  2.87it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.47it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.70it/s][A
- 66%|█████████████▉       | 119/179 [00:43<00:21,  2.81it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.88it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:24,  2.41it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.62it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:20,  2.74it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.83it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:22,  2.45it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.67it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.77it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.84it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.44it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.67it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.78it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.86it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.46it/s][A
- 75%|███████████████▋     | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|███████████████▉     | 136/179 [00:50<00:14,  2.87it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.70it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.79it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.77it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.44it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|██████████████████   | 154/179 [00:56<00:09,  2.70it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.80it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.47it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████���▋  | 159/179 [00:58<00:07,  2.79it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.46it/s][A
- 91%|███████████████████  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.80it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.88it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.46it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.88it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.48it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.70it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.87it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.48it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.78it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.46it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.68it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.53it/s][A                                                            
-                                                            [A{'eval_loss': 2.5223703384399414, 'eval_runtime': 68.6693, 'eval_samples_per_second': 2.854, 'eval_steps_per_second': 1.427, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.2}
- 20%|███▏            | 200/1000 [1:06:21<1:42:33,  7.69s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.53it/s][A
-                                                            [A[2025-10-18 20:09:08,622] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-200
- 20%|███▏            | 201/1000 [1:06:33<6:59:19, 31.49s/it]                                                            {'loss': 2.5034, 'grad_norm': 0.8462835550308228, 'learning_rate': 0.00018522168236559695, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 789.73, 'epoch': 0.2}
- 20%|███▏            | 201/1000 [1:06:33<6:59:19, 31.49s/it] 20%|███▏            | 202/1000 [1:06:40<5:23:51, 24.35s/it]                                                            {'loss': 2.5937, 'grad_norm': 0.7525960803031921, 'learning_rate': 0.00018505178765452853, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1044.13, 'epoch': 0.2}
- 20%|███▏            | 202/1000 [1:06:40<5:23:51, 24.35s/it] 20%|███▏            | 203/1000 [1:06:48<4:17:03, 19.35s/it]                                                            {'loss': 2.4964, 'grad_norm': 0.7811993360519409, 'learning_rate': 0.00018488100079045344, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 948.58, 'epoch': 0.2}
- 20%|███▏            | 203/1000 [1:06:48<4:17:03, 19.35s/it] 20%|███▎            | 204/1000 [1:06:56<3:30:09, 15.84s/it]                                                            {'loss': 2.539, 'grad_norm': 1.081154465675354, 'learning_rate': 0.00018470932356484508, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 461.7, 'epoch': 0.2}
- 20%|███▎            | 204/1000 [1:06:56<3:30:09, 15.84s/it] 20%|███▎            | 205/1000 [1:07:03<2:57:23, 13.39s/it]                                                            {'loss': 2.6144, 'grad_norm': 0.7866605520248413, 'learning_rate': 0.00018453675777851627, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 899.79, 'epoch': 0.2}
- 20%|███▎            | 205/1000 [1:07:03<2:57:23, 13.39s/it] 21%|███▎            | 206/1000 [1:07:11<2:34:34, 11.68s/it]                                                            {'loss': 2.6113, 'grad_norm': 0.934302031993866, 'learning_rate': 0.00018436330524160047, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 583.13, 'epoch': 0.21}
- 21%|███▎            | 206/1000 [1:07:11<2:34:34, 11.68s/it] 21%|███▎            | 207/1000 [1:07:19<2:18:37, 10.49s/it]                                                            {'loss': 2.5378, 'grad_norm': 0.7828210592269897, 'learning_rate': 0.0001841889677735327, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 955.13, 'epoch': 0.21}
- 21%|███▎            | 207/1000 [1:07:19<2:18:37, 10.49s/it] 21%|███▎            | 208/1000 [1:07:26<2:07:22,  9.65s/it]                                                            {'loss': 2.7234, 'grad_norm': 0.7601350545883179, 'learning_rate': 0.00018401374720303056, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1033.11, 'epoch': 0.21}
- 21%|███▎            | 208/1000 [1:07:26<2:07:22,  9.65s/it] 21%|███▎            | 209/1000 [1:07:34<1:59:28,  9.06s/it]                                                            {'loss': 2.6186, 'grad_norm': 0.873517632484436, 'learning_rate': 0.00018383764536807485, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 727.58, 'epoch': 0.21}
- 21%|███▎            | 209/1000 [1:07:34<1:59:28,  9.06s/it] 21%|███▎            | 210/1000 [1:07:42<1:53:51,  8.65s/it]                                                            {'loss': 2.6191, 'grad_norm': 0.864784836769104, 'learning_rate': 0.0001836606641158905, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 744.55, 'epoch': 0.21}
- 21%|███▎            | 210/1000 [1:07:42<1:53:51,  8.65s/it] 21%|███▍            | 211/1000 [1:07:50<1:49:54,  8.36s/it]                                                            {'loss': 2.5766, 'grad_norm': 0.8617127537727356, 'learning_rate': 0.00018348280530292713, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1065.32, 'epoch': 0.21}
- 21%|███▍            | 211/1000 [1:07:50<1:49:54,  8.36s/it] 21%|███▍            | 212/1000 [1:07:57<1:47:09,  8.16s/it]                                                            {'loss': 2.4749, 'grad_norm': 0.6952270865440369, 'learning_rate': 0.00018330407079483952, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1201.56, 'epoch': 0.21}
- 21%|███▍            | 212/1000 [1:07:57<1:47:09,  8.16s/it] 21%|███▍            | 213/1000 [1:08:05<1:45:04,  8.01s/it]                                                            {'loss': 2.5784, 'grad_norm': 0.7723666429519653, 'learning_rate': 0.0001831244624664681, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 980.98, 'epoch': 0.21}
- 21%|███▍            | 213/1000 [1:08:05<1:45:04,  8.01s/it] 21%|███▍            | 214/1000 [1:08:13<1:43:40,  7.91s/it]                                                            {'loss': 2.5261, 'grad_norm': 0.8330923318862915, 'learning_rate': 0.00018294398220181917, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 785.79, 'epoch': 0.21}
- 21%|███▍            | 214/1000 [1:08:13<1:43:40,  7.91s/it] 22%|███▍            | 215/1000 [1:08:20<1:42:45,  7.85s/it]                                                            {'loss': 2.6191, 'grad_norm': 0.8040612936019897, 'learning_rate': 0.0001827626318940454, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 900.48, 'epoch': 0.21}
- 22%|███▍            | 215/1000 [1:08:20<1:42:45,  7.85s/it] 22%|███▍            | 216/1000 [1:08:28<1:41:58,  7.80s/it]                                                            {'loss': 2.6265, 'grad_norm': 0.8339094519615173, 'learning_rate': 0.00018258041344542566, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 906.59, 'epoch': 0.22}
- 22%|███▍            | 216/1000 [1:08:28<1:41:58,  7.80s/it] 22%|███▍            | 217/1000 [1:08:36<1:41:19,  7.76s/it]                                                            {'loss': 2.7403, 'grad_norm': 0.891198992729187, 'learning_rate': 0.00018239732876734527, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 781.33, 'epoch': 0.22}
- 22%|███▍            | 217/1000 [1:08:36<1:41:19,  7.76s/it] 22%|███▍            | 218/1000 [1:08:43<1:40:45,  7.73s/it]                                                            {'loss': 2.4186, 'grad_norm': 0.7664593458175659, 'learning_rate': 0.00018221337978027583, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 963.27, 'epoch': 0.22}
- 22%|███▍            | 218/1000 [1:08:43<1:40:45,  7.73s/it] 22%|███▌            | 219/1000 [1:08:51<1:40:29,  7.72s/it]                                                            {'loss': 2.5801, 'grad_norm': 0.7644863724708557, 'learning_rate': 0.00018202856841375518, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1021.95, 'epoch': 0.22}
- 22%|███▌            | 219/1000 [1:08:51<1:40:29,  7.72s/it] 22%|███▌            | 220/1000 [1:08:59<1:40:15,  7.71s/it]                                                            {'loss': 2.6876, 'grad_norm': 0.7600088119506836, 'learning_rate': 0.00018184289660636715, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1008.59, 'epoch': 0.22}
- 22%|███▌            | 220/1000 [1:08:59<1:40:15,  7.71s/it] 22%|███▌            | 221/1000 [1:09:06<1:39:59,  7.70s/it]                                                            {'loss': 2.4985, 'grad_norm': 0.8998281359672546, 'learning_rate': 0.0001816563663057211, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 676.03, 'epoch': 0.22}
- 22%|███▌            | 221/1000 [1:09:06<1:39:59,  7.70s/it] 22%|███▌            | 222/1000 [1:09:14<1:39:54,  7.70s/it]                                                            {'loss': 2.5881, 'grad_norm': 0.8680386543273926, 'learning_rate': 0.00018146897946843163, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 835.86, 'epoch': 0.22}
- 22%|███▌            | 222/1000 [1:09:14<1:39:54,  7.70s/it] 22%|███▌            | 223/1000 [1:09:22<1:39:44,  7.70s/it]                                                            {'loss': 2.744, 'grad_norm': 0.8678551316261292, 'learning_rate': 0.000181280738060098, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 803.41, 'epoch': 0.22}
- 22%|███▌            | 223/1000 [1:09:22<1:39:44,  7.70s/it] 22%|███▌            | 224/1000 [1:09:29<1:39:38,  7.70s/it]                                                            {'loss': 2.6716, 'grad_norm': 0.8682263493537903, 'learning_rate': 0.0001810916440552835, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 812.29, 'epoch': 0.22}
- 22%|███▌            | 224/1000 [1:09:30<1:39:38,  7.70s/it] 22%|███▌            | 225/1000 [1:09:37<1:39:23,  7.69s/it]                                                            {'loss': 2.3376, 'grad_norm': 0.8563072681427002, 'learning_rate': 0.00018090169943749476, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 698.99, 'epoch': 0.23}
- 22%|███▌            | 225/1000 [1:09:37<1:39:23,  7.69s/it] 23%|███▌            | 226/1000 [1:09:45<1:39:14,  7.69s/it]                                                            {'loss': 2.369, 'grad_norm': 0.7387136220932007, 'learning_rate': 0.00018071090619916093, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1010.19, 'epoch': 0.23}
- 23%|███▌            | 226/1000 [1:09:45<1:39:14,  7.69s/it] 23%|███▋            | 227/1000 [1:09:53<1:39:12,  7.70s/it]                                                            {'loss': 2.5163, 'grad_norm': 0.7362717986106873, 'learning_rate': 0.00018051926634161282, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1180.42, 'epoch': 0.23}
- 23%|███▋            | 227/1000 [1:09:53<1:39:12,  7.70s/it] 23%|███▋            | 228/1000 [1:10:00<1:38:55,  7.69s/it]                                                            {'loss': 2.5097, 'grad_norm': 1.0583449602127075, 'learning_rate': 0.00018032678187506187, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 682.1, 'epoch': 0.23}
- 23%|███▋            | 228/1000 [1:10:00<1:38:55,  7.69s/it] 23%|███▋            | 229/1000 [1:10:08<1:38:47,  7.69s/it]                                                            {'loss': 2.59, 'grad_norm': 0.7982457876205444, 'learning_rate': 0.00018013345481857903, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 956.84, 'epoch': 0.23}
- 23%|███▋            | 229/1000 [1:10:08<1:38:47,  7.69s/it] 23%|███▋            | 230/1000 [1:10:16<1:38:41,  7.69s/it]                                                            {'loss': 2.4428, 'grad_norm': 0.9278607368469238, 'learning_rate': 0.0001799392872000736, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 680.17, 'epoch': 0.23}
- 23%|███▋            | 230/1000 [1:10:16<1:38:41,  7.69s/it] 23%|███▋            | 231/1000 [1:10:23<1:38:32,  7.69s/it]                                                            {'loss': 2.4873, 'grad_norm': 0.8219959735870361, 'learning_rate': 0.00017974428105627208, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 809.78, 'epoch': 0.23}
- 23%|███▋            | 231/1000 [1:10:23<1:38:32,  7.69s/it] 23%|███▋            | 232/1000 [1:10:31<1:38:22,  7.69s/it]                                                            {'loss': 2.4619, 'grad_norm': 0.7974353432655334, 'learning_rate': 0.00017954843843269664, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 833.42, 'epoch': 0.23}
- 23%|███▋            | 232/1000 [1:10:31<1:38:22,  7.69s/it] 23%|███▋            | 233/1000 [1:10:39<1:38:24,  7.70s/it]                                                            {'loss': 2.4483, 'grad_norm': 0.7704808712005615, 'learning_rate': 0.0001793517613836437, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 911.44, 'epoch': 0.23}
- 23%|███▋            | 233/1000 [1:10:39<1:38:24,  7.70s/it] 23%|███▋            | 234/1000 [1:10:46<1:38:11,  7.69s/it]                                                            {'loss': 2.6092, 'grad_norm': 0.9593154191970825, 'learning_rate': 0.00017915425197216245, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 723.51, 'epoch': 0.23}
- 23%|███▋            | 234/1000 [1:10:46<1:38:11,  7.69s/it] 24%|███▊            | 235/1000 [1:10:54<1:37:48,  7.67s/it]                                                            {'loss': 2.4107, 'grad_norm': 0.7877375483512878, 'learning_rate': 0.00017895591227003315, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 806.81, 'epoch': 0.23}
- 24%|███▊            | 235/1000 [1:10:54<1:37:48,  7.67s/it] 24%|███▊            | 236/1000 [1:11:02<1:37:22,  7.65s/it]                                                            {'loss': 2.5001, 'grad_norm': 0.8339622020721436, 'learning_rate': 0.00017875674435774547, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 827.69, 'epoch': 0.24}
- 24%|███▊            | 236/1000 [1:11:02<1:37:22,  7.65s/it] 24%|███▊            | 237/1000 [1:11:09<1:37:13,  7.65s/it]                                                            {'loss': 2.298, 'grad_norm': 0.8007955551147461, 'learning_rate': 0.00017855675032447648, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 942.21, 'epoch': 0.24}
- 24%|███▊            | 237/1000 [1:11:09<1:37:13,  7.65s/it] 24%|███▊            | 238/1000 [1:11:17<1:37:01,  7.64s/it]                                                            {'loss': 2.5452, 'grad_norm': 0.8379720449447632, 'learning_rate': 0.00017835593226806903, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 823.16, 'epoch': 0.24}
- 24%|███▊            | 238/1000 [1:11:17<1:37:01,  7.64s/it] 24%|███▊            | 239/1000 [1:11:24<1:36:47,  7.63s/it]                                                            {'loss': 2.5321, 'grad_norm': 0.9570440053939819, 'learning_rate': 0.00017815429229500946, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 666.42, 'epoch': 0.24}
- 24%|███▊            | 239/1000 [1:11:24<1:36:47,  7.63s/it] 24%|███▊            | 240/1000 [1:11:32<1:36:34,  7.62s/it]                                                            {'loss': 2.417, 'grad_norm': 0.8553866147994995, 'learning_rate': 0.00017795183252040567, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 747.43, 'epoch': 0.24}
- 24%|███▊            | 240/1000 [1:11:32<1:36:34,  7.62s/it] 24%|███▊            | 241/1000 [1:11:40<1:36:28,  7.63s/it]                                                            {'loss': 2.6716, 'grad_norm': 0.8415744304656982, 'learning_rate': 0.00017774855506796496, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 800.47, 'epoch': 0.24}
- 24%|███▊            | 241/1000 [1:11:40<1:36:28,  7.63s/it] 24%|███▊            | 242/1000 [1:11:47<1:36:24,  7.63s/it]                                                            {'loss': 2.7438, 'grad_norm': 0.8283866047859192, 'learning_rate': 0.0001775444620699715, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 881.59, 'epoch': 0.24}
- 24%|███▊            | 242/1000 [1:11:47<1:36:24,  7.63s/it] 24%|███▉            | 243/1000 [1:11:55<1:36:13,  7.63s/it]                                                            {'loss': 2.7123, 'grad_norm': 0.946386992931366, 'learning_rate': 0.0001773395556672644, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 733.81, 'epoch': 0.24}
- 24%|███▉            | 243/1000 [1:11:55<1:36:13,  7.63s/it] 24%|███▉            | 244/1000 [1:12:03<1:36:06,  7.63s/it]                                                            {'loss': 2.4819, 'grad_norm': 0.7976366877555847, 'learning_rate': 0.00017713383800921478, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 917.12, 'epoch': 0.24}
- 24%|███▉            | 244/1000 [1:12:03<1:36:06,  7.63s/it][2025-10-18 20:15:11,342] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42528] Loading dataset: AiAF/conversations with base_type: chat_template and prompt_style: None
-[2025-10-18 20:15:11,343] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:42528] Using chat template:
+Tokenizing Prompts (num_proc=28):   0%|                                                                                                        | 0/76776 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=28):   1%|█▏                                                                                         | 1000/76776 [03:23<4:16:38,  4.92 examples/s]Tokenizing Prompts (num_proc=28):   3%|██▎                                                                                        | 2000/76776 [03:25<1:45:42, 11.79 examples/s]Tokenizing Prompts (num_proc=28):   3%|██▎                                                                                        | 2000/76776 [03:38<1:45:42, 11.79 examples/s]Tokenizing Prompts (num_proc=28):   4%|███▌                                                                                       | 3000/76776 [03:44<1:07:16, 18.28 examples/s]Tokenizing Prompts (num_proc=28):   4%|███▌                                                                                       | 3000/76776 [03:58<1:07:16, 18.28 examples/s]Tokenizing Prompts (num_proc=28):   5%|████▋                                                                                      | 4000/76776 [04:29<1:02:09, 19.51 examples/s]Tokenizing Prompts (num_proc=28):   5%|████▋                                                                                      | 4000/76776 [04:48<1:02:09, 19.51 examples/s]Tokenizing Prompts (num_proc=28):   7%|██████                                                                                       | 5000/76776 [05:05<54:28, 21.96 examples/s]Tokenizing Prompts (num_proc=28):   7%|██████                                                                                       | 5000/76776 [05:18<54:28, 21.96 examples/s]Tokenizing Prompts (num_proc=28):   8%|███████▎                                                                                     | 6000/76776 [05:50<53:43, 21.96 examples/s]Tokenizing Prompts (num_proc=28):   9%|████████▍                                                                                    | 7000/76776 [06:01<39:42, 29.29 examples/s]Tokenizing Prompts (num_proc=28):  10%|█████████▋                                                                                   | 8000/76776 [06:06<28:18, 40.49 examples/s]Tokenizing Prompts (num_proc=28):  10%|█████████▋                                                                                   | 8000/76776 [06:18<28:18, 40.49 examples/s]Tokenizing Prompts (num_proc=28):  12%|██████████▉                                                                                  | 9000/76776 [06:26<26:15, 43.02 examples/s]Tokenizing Prompts (num_proc=28):  13%|███████████▊                                                                                 | 9742/76776 [06:33<22:08, 50.46 examples/s]Tokenizing Prompts (num_proc=28):  13%|███████████▊                                                                                 | 9742/76776 [06:48<22:08, 50.46 examples/s]Tokenizing Prompts (num_proc=28):  14%|████████████▊                                                                               | 10742/76776 [06:57<23:26, 46.96 examples/s]Tokenizing Prompts (num_proc=28):  14%|████████████▊                                                                               | 10742/76776 [07:08<23:26, 46.96 examples/s]Tokenizing Prompts (num_proc=28):  15%|██████████████                                                                              | 11742/76776 [07:25<25:28, 42.55 examples/s]Tokenizing Prompts (num_proc=28):  15%|██████████████                                                                              | 11742/76776 [07:38<25:28, 42.55 examples/s]Tokenizing Prompts (num_proc=28):  17%|███████████████▎                                                                            | 12742/76776 [07:58<27:59, 38.12 examples/s]Tokenizing Prompts (num_proc=28):  17%|███████████████▎                                                                            | 12742/76776 [08:08<27:59, 38.12 examples/s]Tokenizing Prompts (num_proc=28):  18%|████████████████▏                                                                           | 13484/76776 [08:31<32:35, 32.37 examples/s]Tokenizing Prompts (num_proc=28):  19%|█████████████████▎                                                                          | 14484/76776 [08:46<26:43, 38.85 examples/s]Tokenizing Prompts (num_proc=28):  20%|██████████████████▌                                                                         | 15484/76776 [08:56<21:11, 48.20 examples/s]Tokenizing Prompts (num_proc=28):  20%|██████████████████▌                                                                         | 15484/76776 [09:08<21:11, 48.20 examples/s]Tokenizing Prompts (num_proc=28):  21%|███████████████████▍                                                                        | 16226/76776 [09:28<26:32, 38.02 examples/s]Tokenizing Prompts (num_proc=28):  21%|███████████████████▍                                                                        | 16226/76776 [09:48<26:32, 38.02 examples/s]Tokenizing Prompts (num_proc=28):  22%|████████████████████▋                                                                       | 17226/76776 [09:56<26:40, 37.20 examples/s]Tokenizing Prompts (num_proc=28):  24%|█████████████████████▊                                                                      | 18226/76776 [10:02<19:43, 49.46 examples/s]Tokenizing Prompts (num_proc=28):  25%|███████████████████████                                                                     | 19226/76776 [10:04<14:00, 68.48 examples/s]Tokenizing Prompts (num_proc=28):  26%|████████████████████████▏                                                                   | 20226/76776 [10:15<12:39, 74.45 examples/s]Tokenizing Prompts (num_proc=28):  28%|█████████████████████████▍                                                                  | 21226/76776 [10:19<09:45, 94.93 examples/s]Tokenizing Prompts (num_proc=28):  28%|█████████████████████████▍                                                                  | 21226/76776 [10:38<09:45, 94.93 examples/s]Tokenizing Prompts (num_proc=28):  29%|██████████████████████████▋                                                                 | 22226/76776 [11:35<27:52, 32.62 examples/s]Tokenizing Prompts (num_proc=28):  29%|██████████████████████████▋                                                                 | 22226/76776 [11:48<27:52, 32.62 examples/s]Tokenizing Prompts (num_proc=28):  30%|███████████████████████████▊                                                                | 23226/76776 [12:00<25:48, 34.58 examples/s]Tokenizing Prompts (num_proc=28):  30%|███████████████████████████▊                                                                | 23226/76776 [12:18<25:48, 34.58 examples/s]Tokenizing Prompts (num_proc=28):  32%|█████████████████████████████                                                               | 24226/76776 [12:35<26:45, 32.73 examples/s]Tokenizing Prompts (num_proc=28):  32%|█████████████████████████████                                                               | 24226/76776 [12:48<26:45, 32.73 examples/s]Tokenizing Prompts (num_proc=28):  33%|██████████████████████████████▏                                                             | 25226/76776 [13:40<35:18, 24.34 examples/s]Tokenizing Prompts (num_proc=28):  34%|███████████████████████████████▍                                                            | 26226/76776 [13:52<27:13, 30.94 examples/s]Tokenizing Prompts (num_proc=28):  37%|█████████████████████████████████▊                                                          | 28226/76776 [14:00<15:37, 51.79 examples/s]Tokenizing Prompts (num_proc=28):  38%|██████████████████████████████████▋                                                         | 28968/76776 [14:01<12:24, 64.18 examples/s]Tokenizing Prompts (num_proc=28):  38%|██████████████████████████████████▋                                                         | 28968/76776 [14:18<12:24, 64.18 examples/s]Tokenizing Prompts (num_proc=28):  39%|███████████████████████████████████▉                                                        | 29968/76776 [15:26<27:00, 28.88 examples/s]Tokenizing Prompts (num_proc=28):  39%|███████████████████████████████████▉                                                        | 29968/76776 [15:38<27:00, 28.88 examples/s]Tokenizing Prompts (num_proc=28):  40%|█████████████████████████████████████                                                       | 30968/76776 [15:42<22:26, 34.02 examples/s]Tokenizing Prompts (num_proc=28):  42%|██████████████████████████████████████▎                                                     | 31968/76776 [15:58<19:04, 39.16 examples/s]Tokenizing Prompts (num_proc=28):  43%|███████████████████████████████████████▌                                                    | 32968/76776 [16:07<15:09, 48.15 examples/s]Tokenizing Prompts (num_proc=28):  44%|████████████████████████████████████████▋                                                   | 33968/76776 [16:13<11:47, 60.53 examples/s]Tokenizing Prompts (num_proc=28):  46%|█████████████████████████████████████████▉                                                  | 34968/76776 [16:25<10:35, 65.75 examples/s]Tokenizing Prompts (num_proc=28):  46%|█████████████████████████████████████████▉                                                  | 34968/76776 [16:38<10:35, 65.75 examples/s]Tokenizing Prompts (num_proc=28):  47%|███████████████████████████████████████████                                                 | 35968/76776 [17:15<17:25, 39.04 examples/s]Tokenizing Prompts (num_proc=28):  47%|███████████████████████████████████████████                                                 | 35968/76776 [17:28<17:25, 39.04 examples/s]Tokenizing Prompts (num_proc=28):  48%|████████████████████████████████████████████▎                                               | 36968/76776 [17:36<15:54, 41.72 examples/s]Tokenizing Prompts (num_proc=28):  48%|████████████████████████████████████████████▎                                               | 36968/76776 [17:48<15:54, 41.72 examples/s]Tokenizing Prompts (num_proc=28):  49%|█████████████████████████████████████████████▍                                              | 37968/76776 [17:58<15:08, 42.72 examples/s]Tokenizing Prompts (num_proc=28):  49%|█████████████████████████████████████████████▍                                              | 37968/76776 [18:08<15:08, 42.72 examples/s]Tokenizing Prompts (num_proc=28):  51%|██████████████████████████████████████████████▋                                             | 38968/76776 [18:09<12:31, 50.31 examples/s]Tokenizing Prompts (num_proc=28):  52%|███████████████████████████████████████████████▌                                            | 39710/76776 [18:20<11:27, 53.95 examples/s]Tokenizing Prompts (num_proc=28):  52%|███████████████████████████████████████████████▌                                            | 39710/76776 [18:38<11:27, 53.95 examples/s]Tokenizing Prompts (num_proc=28):  53%|████████████████████████████████████████████████▊                                           | 40710/76776 [18:49<13:13, 45.43 examples/s]Tokenizing Prompts (num_proc=28):  53%|████████████████████████████████████████████████▊                                           | 40710/76776 [19:08<13:13, 45.43 examples/s]Tokenizing Prompts (num_proc=28):  54%|█████████████████████████████████████████████████▉                                          | 41710/76776 [19:09<12:31, 46.68 examples/s]Tokenizing Prompts (num_proc=28):  54%|█████████████████████████████████████████████████▉                                          | 41710/76776 [19:28<12:31, 46.68 examples/s]Tokenizing Prompts (num_proc=28):  56%|███████████████████████████████████████████████████▏                                        | 42710/76776 [19:33<12:27, 45.60 examples/s]Tokenizing Prompts (num_proc=28):  56%|███████████████████████████████████████████████████▏                                        | 42710/76776 [19:48<12:27, 45.60 examples/s]Tokenizing Prompts (num_proc=28):  57%|████████████████████████████████████████████████████                                        | 43452/76776 [20:00<14:14, 38.99 examples/s]Tokenizing Prompts (num_proc=28):  58%|█████████████████████████████████████████████████████▎                                      | 44452/76776 [20:02<09:42, 55.48 examples/s]Tokenizing Prompts (num_proc=28):  59%|██████████████████████████████████████████████████████▏                                     | 45194/76776 [20:15<09:20, 56.30 examples/s]Tokenizing Prompts (num_proc=28):  59%|██████████████████████████████████████████████████████▏                                     | 45194/76776 [20:28<09:20, 56.30 examples/s]Tokenizing Prompts (num_proc=28):  60%|███████████████████████████████████████████████████████▎                                    | 46194/76776 [22:11<25:36, 19.90 examples/s]Tokenizing Prompts (num_proc=28):  61%|████████████████████████████████████████████████████████▏                                   | 46936/76776 [22:18<19:47, 25.14 examples/s]Tokenizing Prompts (num_proc=28):  62%|██████████████████████████████████████████████���██████████▍                                  | 47936/76776 [22:19<12:47, 37.55 examples/s]Tokenizing Prompts (num_proc=28):  64%|██████████████████████████████████████████████████████████▋                                 | 48936/76776 [22:34<10:40, 43.48 examples/s]Tokenizing Prompts (num_proc=28):  64%|██████████████████████████████████████████████████████████▋                                 | 48936/76776 [22:48<10:40, 43.48 examples/s]Tokenizing Prompts (num_proc=28):  65%|███████████████████████████████████████████████████████████▊                                | 49936/76776 [22:51<09:29, 47.14 examples/s]Tokenizing Prompts (num_proc=28):  65%|███████████████████████████████████████████████████████████▊                                | 49936/76776 [23:08<09:29, 47.14 examples/s]Tokenizing Prompts (num_proc=28):  66%|█████████████████████████████████████████████████████████████                               | 50936/76776 [23:18<09:51, 43.65 examples/s]Tokenizing Prompts (num_proc=28):  66%|█████████████████████████████████████████████████████████████                               | 50936/76776 [23:38<09:51, 43.65 examples/s]Tokenizing Prompts (num_proc=28):  68%|██████████████████████████████████████████████████████████████▏                             | 51936/76776 [23:38<09:09, 45.17 examples/s]Tokenizing Prompts (num_proc=28):  69%|███████████████████████████████████████████████████████████████                             | 52678/76776 [23:39<06:47, 59.09 examples/s]Tokenizing Prompts (num_proc=28):  70%|████████████████████████████████████████████████████████████████▎                           | 53678/76776 [23:40<04:33, 84.48 examples/s]Tokenizing Prompts (num_proc=28):  70%|████████████████████████████████████████████████████████████████▎                           | 53678/76776 [23:58<04:33, 84.48 examples/s]Tokenizing Prompts (num_proc=28):  71%|█████████████████████████████████████████████████████████████████▌                          | 54678/76776 [24:19<07:32, 48.80 examples/s]Tokenizing Prompts (num_proc=28):  73%|██████████████████████████████████████████████████████████████████▋                         | 55678/76776 [24:35<06:41, 52.49 examples/s]Tokenizing Prompts (num_proc=28):  73%|██████████████████████████████████████████████████████████████████▋                         | 55678/76776 [24:48<06:41, 52.49 examples/s]Tokenizing Prompts (num_proc=28):  73%|███████████████████████████████████████████████████████████████████▌                        | 56420/76776 [26:14<16:01, 21.17 examples/s]Tokenizing Prompts (num_proc=28):  74%|████████████████████████████████████████████████████████████████████▍                       | 57162/76776 [26:15<11:33, 28.27 examples/s]Tokenizing Prompts (num_proc=28):  74%|████████████████████████████████████████████████████████████████████▍                       | 57162/76776 [26:28<11:33, 28.27 examples/s]Tokenizing Prompts (num_proc=28):  75%|█████████████████████████████████████████████████████████████████████▍                      | 57904/76776 [26:55<12:37, 24.90 examples/s]Tokenizing Prompts (num_proc=28):  76%|██████████████████████████████████████████████████████████████████████▎                     | 58646/76776 [26:55<08:48, 34.31 examples/s]Tokenizing Prompts (num_proc=28):  76%|██████████████████████████████████████████████████████████████████████▎                     | 58646/76776 [27:08<08:48, 34.31 examples/s]Tokenizing Prompts (num_proc=28):  77%|███████████████████████████████████████████████████████████████████████▏                    | 59388/76776 [27:08<07:28, 38.77 examples/s]Tokenizing Prompts (num_proc=28):  79%|████████████████████████████████████████████████████████████████████████▎                   | 60388/76776 [27:10<04:44, 57.65 examples/s]Tokenizing Prompts (num_proc=28):  79%|████████████████████████████████████████████████████████████████████████▎                   | 60388/76776 [27:28<04:44, 57.65 examples/s]Tokenizing Prompts (num_proc=28):  80%|█████████████████████████████████████████████████████████████████████████▎                  | 61130/76776 [29:09<14:43, 17.71 examples/s]Tokenizing Prompts (num_proc=28):  81%|██████████████████████████████████████████████████████████████████████████▍                 | 62130/76776 [30:04<13:37, 17.91 examples/s]Tokenizing Prompts (num_proc=28):  81%|██████████████████████████████████████████████████████████████████████████▍                 | 62130/76776 [30:18<13:37, 17.91 examples/s]Tokenizing Prompts (num_proc=28):  82%|███████████████████████████████████████████████████████████████████████████▋                | 63130/76776 [30:58<12:36, 18.05 examples/s]Tokenizing Prompts (num_proc=28):  84%|████████████████████████████████████████████████████████████████████████████▊               | 64130/76776 [31:12<08:51, 23.79 examples/s]Tokenizing Prompts (num_proc=28):  84%|█████████████████████████████████████████████████████████████████████████████▋              | 64872/76776 [31:17<06:31, 30.39 examples/s]Tokenizing Prompts (num_proc=28):  84%|█████████████████████████████████████████████████████████████████████████████▋              | 64872/76776 [31:28<06:31, 30.39 examples/s]Tokenizing Prompts (num_proc=28):  86%|██████████████████████████████████████████████████████████████████████████████▉             | 65872/76776 [31:57<06:26, 28.23 examples/s]Tokenizing Prompts (num_proc=28):  87%|███████████████████████████████████████████████████████████████████████████████▊            | 66614/76776 [32:00<04:37, 36.65 examples/s]Tokenizing Prompts (num_proc=28):  87%|███████████████████████████████████████████████████████████████████████████████▊            | 66614/76776 [32:18<04:37, 36.65 examples/s]Tokenizing Prompts (num_proc=28):  88%|█████████████████████████████████████████████████████████████████████████████████           | 67614/76776 [33:35<07:35, 20.12 examples/s]Tokenizing Prompts (num_proc=28):  89%|█████████████████████████████████████████████████████████████████████████████████▉          | 68356/76776 [33:38<05:20, 26.30 examples/s]Tokenizing Prompts (num_proc=28):  89%|█████████████████████████████████████████████████████████████████████████████████▉          | 68356/76776 [33:58<05:20, 26.30 examples/s]Tokenizing Prompts (num_proc=28):  90%|██████████████████████████████████████████████████████████████████████████████████▊         | 69098/76776 [34:51<06:57, 18.41 examples/s]Tokenizing Prompts (num_proc=28):  90%|██████████████████████████████████████████████████████████████████████████████████▊         | 69098/76776 [35:08<06:57, 18.41 examples/s]Tokenizing Prompts (num_proc=28):  91%|███████████████████████████████████████████████████████████████████████████████████▉        | 70098/76776 [36:02<06:40, 16.67 examples/s]Tokenizing Prompts (num_proc=28):  91%|███████████████████████████████████████████████████████████████████████████████████▉        | 70098/76776 [36:18<06:40, 16.67 examples/s]Tokenizing Prompts (num_proc=28):  92%|████████████████████████████████████████████████████████████████████████████████████▉       | 70840/76776 [36:21<05:01, 19.67 examples/s]Tokenizing Prompts (num_proc=28):  92%|████████████████████████████████████████████████████████████████████████████████████▉       | 70840/76776 [36:38<05:01, 19.67 examples/s]Tokenizing Prompts (num_proc=28):  93%|█████████████████████████████████████████████████████████████████████████████████████▊      | 71582/76776 [40:14<10:40,  8.10 examples/s]Tokenizing Prompts (num_proc=28):  94%|██████████████████████████████████████████████████████████████████████████████████████▋     | 72324/76776 [40:51<07:36,  9.76 examples/s]Tokenizing Prompts (num_proc=28):  94%|██████████████████████████████████████████████████████████████████████████████████████▋     | 72324/76776 [41:08<07:36,  9.76 examples/s]Tokenizing Prompts (num_proc=28):  95%|███████████████████████████████████████████████████████████████████████████████████████▌    | 73066/76776 [42:10<06:24,  9.66 examples/s]Tokenizing Prompts (num_proc=28):  96%|████████████████████████████████████████████████████████████████████████████████████████▍   | 73808/76776 [42:11<03:38, 13.55 examples/s]Tokenizing Prompts (num_proc=28):  96%|█████████████████████████████████████████████████████████████████████████████████���██████▍   | 73808/76776 [42:28<03:38, 13.55 examples/s]Tokenizing Prompts (num_proc=28):  97%|█████████████████████████████████████████████████████████████████████████████████████████▎  | 74550/76776 [43:34<03:09, 11.73 examples/s]Tokenizing Prompts (num_proc=28):  98%|██████████████████████████████████████████████████████████████████████████████████████████▏ | 75292/76776 [45:44<02:46,  8.94 examples/s]Tokenizing Prompts (num_proc=28):  99%|███████████████████████████████████████████████████████████████████████████████████████████ | 76034/76776 [52:40<03:01,  4.08 examples/s]Tokenizing Prompts (num_proc=28): 100%|████████████████████████████████████████████████████████████████████████████████████████████| 76776/76776 [55:36<00:00,  4.12 examples/s]Tokenizing Prompts (num_proc=28): 100%|████████████████████████████████████████████████████████████████████████████████████████████| 76776/76776 [55:37<00:00, 23.01 examples/s]
+[2026-03-30 14:34:06,524] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:37135] min_input_len: 79
+[2026-03-30 14:34:06,524] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:37135] max_input_len: 207535
+Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204
+[2026-03-30 14:34:12,077] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:37135] Dropped 60374 sequences outside valid range ([None, 2048])
+Saving the dataset (0/28 shards):   0%| |Saving the dataset (0/28 shards):   4%| |Saving the dataset (1/28 shards):   4%| |Saving the dataset (2/28 shards):   7%| |Saving the dataset (3/28 shards):  11%| |Saving the dataset (4/28 shards):  18%|▏|Saving the dataset (5/28 shards):  21%|▏|Saving the dataset (6/28 shards):  21%|▏|Saving the dataset (7/28 shards):  29%|▎|Saving the dataset (8/28 shards):  29%|▎|Saving the dataset (9/28 shards):  39%|▍|Saving the dataset (10/28 shards):  39%|▍Saving the dataset (11/28 shards):  43%|▍Saving the dataset (12/28 shards):  46%|▍Saving the dataset (13/28 shards):  46%|▍Saving the dataset (14/28 shards):  50%|▍Saving the dataset (15/28 shards):  54%|▌Saving the dataset (16/28 shards):  61%|▌Saving the dataset (17/28 shards):  61%|▌Saving the dataset (18/28 shards):  68%|▋Saving the dataset (19/28 shards):  68%|▋Saving the dataset (20/28 shards):  71%|▋Saving the dataset (21/28 shards):  75%|▋Saving the dataset (22/28 shards):  86%|▊Saving the dataset (23/28 shards):  89%|▉Saving the dataset (24/28 shards):  89%|▉Saving the dataset (25/28 shards):  89%|▉Saving the dataset (26/28 shards):  93%|▉Saving the dataset (27/28 shards):  96%|▉Saving the dataset (28/28 shards): 100%|█Saving the dataset (28/28 shards): 100%|█
+[2026-03-30 14:34:15,379] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:37135] Unable to find prepared dataset in /workspace/axolotl/last_run_prepared/df80f313c04db5e542fa25408a23272d
+[2026-03-30 14:34:15,379] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:37135] Loading raw datasets...
+[2026-03-30 14:34:15,380] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:37135] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+[2026-03-30 14:34:15,673] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:37135] Loading dataset: eval-datasets/shuf-1000_conversations_V3.jsonl with base_type: chat_template and prompt_style: None
+[2026-03-30 14:34:15,674] [INFO] [axolotl.prompt_strategies.chat_template.__call__:998] [PID:37135] Using chat template:
 ---
 {{ bos_token }}
 {% for m in messages %}
@@ -1476,4157 +254,215 @@ trainable params: 83,066,880 || all params: 2,697,408,768 || trainable%: 3.0795
 {% endif %}
 
 ---
-
-Tokenizing Prompts (num_proc=12):   0%| | 0/10000 [00:00<?, [A
-Tokenizing Prompts (num_proc=12):   8%| | 834/10000 [14:03<2[A
-Tokenizing Prompts (num_proc=12):  17%|▏| 1667/10000 [14:15<[A
-Tokenizing Prompts (num_proc=12):  25%|▎| 2501/10000 [17:21<[A
-Tokenizing Prompts (num_proc=12):  33%|▎| 3334/10000 [19:14<[A
-Tokenizing Prompts (num_proc=12):  42%|▍| 4167/10000 [20:16<[A
-Tokenizing Prompts (num_proc=12):  50%|▌| 5000/10000 [20:22<[A
-Tokenizing Prompts (num_proc=12):  58%|▌| 5833/10000 [21:36<[A
-Tokenizing Prompts (num_proc=12):  67%|▋| 6667/10000 [21:48<[A
-Tokenizing Prompts (num_proc=12):  75%|▊| 7501/10000 [22:18<[A
-Tokenizing Prompts (num_proc=12):  83%|▊| 8334/10000 [26:42<[A
-Tokenizing Prompts (num_proc=12):  92%|▉| 9167/10000 [28:52<[A
-Tokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [34:39[ATokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [34:42
-
-Dropping Long Sequences:   0%| | 0/10000 [00:00<?, ? example[A
-Dropping Long Sequences:  10%| | 1000/10000 [00:09<01:24, 10[A
-Dropping Long Sequences:  20%|▏| 2000/10000 [00:16<01:03, 12[A
-Dropping Long Sequences:  30%|▎| 3000/10000 [00:23<00:53, 13[A
-Dropping Long Sequences:  40%|▍| 4000/10000 [00:31<00:46, 13[A
-Dropping Long Sequences:  50%|▌| 5000/10000 [00:38<00:38, 13[A
-Dropping Long Sequences:  60%|▌| 6000/10000 [00:46<00:31, 12[A
-Dropping Long Sequences:  70%|▋| 7000/10000 [00:54<00:23, 12[A
-Dropping Long Sequences:  80%|▊| 8000/10000 [01:02<00:15, 13[A
-Dropping Long Sequences:  90%|▉| 9000/10000 [01:09<00:07, 12[A
-Dropping Long Sequences: 100%|█| 10000/10000 [01:18<00:00, 1[ADropping Long Sequences: 100%|█| 10000/10000 [01:18<00:00, 1
-
-Add position_id column (Pretraining Sample Packing):   0%| |[A
-Add position_id column (Pretraining Sample Packing):  47%|▍|[A
-Add position_id column (Pretraining Sample Packing):  95%|▉|[A
-Add position_id column (Pretraining Sample Packing): 100%|█|[AAdd position_id column (Pretraining Sample Packing): 100%|█|
-[2025-10-18 20:51:19,446] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42528] Using single process for pack_parallel, running sequentially.
- 24%|███▏         | 245/1000 [1:48:49<139:56:20, 667.26s/it]                                                            {'loss': 2.7273, 'grad_norm': 1.0437068939208984, 'learning_rate': 0.00017692731125370354, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 677.38, 'epoch': 0.24}
- 24%|███▏         | 245/1000 [1:48:49<139:56:20, 667.26s/it] 25%|███▍          | 246/1000 [1:48:56<98:17:40, 469.31s/it]                                                            {'loss': 2.5675, 'grad_norm': 0.7666839957237244, 'learning_rate': 0.00017671997756709863, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 865.48, 'epoch': 0.25}
- 25%|███▍          | 246/1000 [1:48:56<98:17:40, 469.31s/it] 25%|███▍          | 247/1000 [1:49:04<69:10:50, 330.74s/it]                                                            {'loss': 2.3506, 'grad_norm': 0.8079777956008911, 'learning_rate': 0.00017651183912423228, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 782.18, 'epoch': 0.25}
- 25%|███▍          | 247/1000 [1:49:04<69:10:50, 330.74s/it] 25%|███▍          | 248/1000 [1:49:11<48:49:56, 233.77s/it]                                                            {'loss': 2.4257, 'grad_norm': 0.6931099891662598, 'learning_rate': 0.00017630289810837834, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1159.78, 'epoch': 0.25}
- 25%|███▍          | 248/1000 [1:49:11<48:49:56, 233.77s/it] 25%|███▍          | 249/1000 [1:49:19<34:36:25, 165.89s/it]                                                            {'loss': 2.4966, 'grad_norm': 0.8468870520591736, 'learning_rate': 0.0001760931567112291, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 844.58, 'epoch': 0.25}
- 25%|███▍          | 249/1000 [1:49:19<34:36:25, 165.89s/it] 25%|███▌          | 250/1000 [1:49:26<24:39:47, 118.38s/it]                                                            {'loss': 2.4428, 'grad_norm': 0.7643404006958008, 'learning_rate': 0.00017588261713287267, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 965.55, 'epoch': 0.25}
- 25%|███▌          | 250/1000 [1:49:26<24:39:47, 118.38s/it][2025-10-18 20:52:13,521] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 20:52:16,505] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4350557327270508
-[2025-10-18 20:52:17,942] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4370789527893066
-[2025-10-18 20:52:19,376] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4332811832427979
-[2025-10-18 20:52:20,778] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.402167558670044
-[2025-10-18 20:52:20,778] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:27,  6.35it/s][A
-  2%|▍                      | 3/179 [00:00<00:40,  4.36it/s][A
-  2%|▌                      | 4/179 [00:00<00:45,  3.81it/s][A
-  3%|▋                      | 5/179 [00:01<01:17,  2.24it/s][A
-  3%|▊                      | 6/179 [00:02<01:07,  2.56it/s][A
-  4%|▉                      | 7/179 [00:02<01:03,  2.72it/s][A
-  4%|█                      | 8/179 [00:02<01:00,  2.83it/s][A
-  5%|█▏                     | 9/179 [00:03<01:09,  2.46it/s][A
-  6%|█▏                    | 10/179 [00:03<01:02,  2.70it/s][A
-  6%|█▎                    | 11/179 [00:03<00:59,  2.81it/s][A
-  7%|█▍                    | 12/179 [00:04<00:57,  2.91it/s][A
-  7%|█▌                    | 13/179 [00:04<01:06,  2.50it/s][A
-  8%|█▋                    | 14/179 [00:04<01:00,  2.74it/s][A
-  8%|█▊                    | 15/179 [00:05<00:57,  2.84it/s][A
-  9%|█▉                    | 16/179 [00:05<00:55,  2.92it/s][A
-  9%|██                    | 17/179 [00:06<01:04,  2.52it/s][A
- 10%|██▏                   | 18/179 [00:06<00:58,  2.74it/s][A
- 11%|██▎                   | 19/179 [00:06<00:56,  2.84it/s][A
- 11%|██▍                   | 20/179 [00:07<00:54,  2.93it/s][A
- 12%|██▌                   | 21/179 [00:07<01:02,  2.51it/s][A
- 12%|██▋                   | 22/179 [00:07<00:57,  2.74it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.82it/s][A
- 13%|██▉                   | 24/179 [00:08<00:53,  2.92it/s][A
- 14%|███                   | 25/179 [00:09<01:01,  2.50it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.72it/s][A
- 15%|███▎                  | 27/179 [00:09<00:53,  2.84it/s][A
- 16%|███▍                  | 28/179 [00:09<00:51,  2.91it/s][A
- 16%|███▌                  | 29/179 [00:10<00:59,  2.52it/s][A
- 17%|███▋                  | 30/179 [00:10<00:54,  2.74it/s][A
- 17%|███▊                  | 31/179 [00:11<00:51,  2.85it/s][A
- 18%|███▉                  | 32/179 [00:11<00:50,  2.93it/s][A
- 18%|████                  | 33/179 [00:11<00:57,  2.52it/s][A
- 19%|████▏                 | 34/179 [00:12<00:52,  2.76it/s][A
- 20%|████▎                 | 35/179 [00:12<00:50,  2.86it/s][A
- 20%|████▍                 | 36/179 [00:12<00:48,  2.94it/s][A
- 21%|████▌                 | 37/179 [00:13<00:56,  2.53it/s][A
- 21%|████▋                 | 38/179 [00:13<00:51,  2.76it/s][A
- 22%|████▊                 | 39/179 [00:13<00:48,  2.86it/s][A
- 22%|████▉                 | 40/179 [00:14<00:47,  2.94it/s][A
- 23%|█████                 | 41/179 [00:14<00:58,  2.36it/s][A
- 23%|█████▏                | 42/179 [00:15<00:52,  2.63it/s][A
- 24%|█████▎                | 43/179 [00:15<00:49,  2.76it/s][A
- 25%|█████▍                | 44/179 [00:15<00:47,  2.86it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.48it/s][A
- 26%|█████▋                | 46/179 [00:16<00:48,  2.72it/s][A
- 26%|█████▊                | 47/179 [00:16<00:46,  2.82it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.90it/s][A
- 27%|██████                | 49/179 [00:17<00:52,  2.49it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.73it/s][A
- 28%|██████▎               | 51/179 [00:18<00:44,  2.85it/s][A
- 29%|██████▍               | 52/179 [00:18<00:43,  2.92it/s][A
- 30%|██████▌               | 53/179 [00:19<00:50,  2.51it/s][A
- 30%|██████▋               | 54/179 [00:19<00:45,  2.73it/s][A
- 31%|██████▊               | 55/179 [00:19<00:43,  2.83it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.91it/s][A
- 32%|███████               | 57/179 [00:20<00:48,  2.50it/s][A
- 32%|███████▏              | 58/179 [00:21<00:44,  2.73it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.84it/s][A
- 34%|███████▎              | 60/179 [00:21<00:40,  2.91it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.50it/s][A
- 35%|███████▌              | 62/179 [00:22<00:42,  2.73it/s][A
- 35%|███████▋              | 63/179 [00:22<00:40,  2.83it/s][A
- 36%|███████▊              | 64/179 [00:23<00:39,  2.92it/s][A
- 36%|███████▉              | 65/179 [00:23<00:45,  2.52it/s][A
- 37%|████████              | 66/179 [00:23<00:41,  2.75it/s][A
- 37%|████████▏             | 67/179 [00:24<00:39,  2.85it/s][A
- 38%|████████▎             | 68/179 [00:24<00:37,  2.94it/s][A
- 39%|████████▍             | 69/179 [00:25<00:43,  2.52it/s][A
- 39%|████████▌             | 70/179 [00:25<00:39,  2.74it/s][A
- 40%|████████▋             | 71/179 [00:25<00:37,  2.85it/s][A
- 40%|████████▊             | 72/179 [00:26<00:36,  2.93it/s][A
- 41%|████████▉             | 73/179 [00:26<00:42,  2.52it/s][A
- 41%|█████████             | 74/179 [00:26<00:38,  2.73it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:36,  2.85it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.93it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:42,  2.43it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:38,  2.65it/s][A
- 44%|█████████▋            | 79/179 [00:28<00:36,  2.77it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.86it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|██████████            | 82/179 [00:29<00:35,  2.72it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:33,  2.82it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:32,  2.90it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:37,  2.48it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.71it/s][A
- 49%|██████████▋           | 87/179 [00:31<00:32,  2.81it/s][A
- 49%|██████████▊           | 88/179 [00:31<00:31,  2.90it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:35,  2.50it/s][A
- 50%|███████████           | 90/179 [00:32<00:32,  2.73it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.83it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:29,  2.91it/s][A
- 52%|███████████▍          | 93/179 [00:33<00:34,  2.50it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.72it/s][A
- 53%|███████████▋          | 95/179 [00:34<00:29,  2.84it/s][A
- 54%|███████████▊          | 96/179 [00:34<00:28,  2.91it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:32,  2.51it/s][A
- 55%|████████████          | 98/179 [00:35<00:29,  2.73it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.83it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.90it/s][A
- 56%|███████████▊         | 101/179 [00:36<00:31,  2.50it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.73it/s][A
- 58%|████████████         | 103/179 [00:37<00:26,  2.84it/s][A
- 58%|████████████▏        | 104/179 [00:37<00:25,  2.91it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:29,  2.49it/s][A
- 59%|████████████▍        | 106/179 [00:38<00:26,  2.72it/s][A
- 60%|████████████▌        | 107/179 [00:38<00:25,  2.82it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.89it/s][A
- 61%|████████████▊        | 109/179 [00:39<00:28,  2.49it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.73it/s][A
- 62%|█████████████        | 111/179 [00:40<00:23,  2.84it/s][A
- 63%|█████████████▏       | 112/179 [00:40<00:23,  2.89it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.49it/s][A
- 64%|█████████████▎       | 114/179 [00:41<00:24,  2.70it/s][A
- 64%|█████████████▍       | 115/179 [00:41<00:22,  2.82it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:21,  2.89it/s][A
- 65%|█████████████▋       | 117/179 [00:42<00:25,  2.48it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.70it/s][A
- 66%|█████████████▉       | 119/179 [00:43<00:21,  2.80it/s][A
- 67%|██████████████       | 120/179 [00:43<00:20,  2.89it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.49it/s][A
- 68%|██████████████▎      | 122/179 [00:44<00:20,  2.72it/s][A
- 69%|██████████████▍      | 123/179 [00:44<00:19,  2.83it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:18,  2.90it/s][A
- 70%|██████████████▋      | 125/179 [00:45<00:21,  2.50it/s][A
- 70%|██████████████▊      | 126/179 [00:45<00:19,  2.71it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.81it/s][A
- 72%|███████████████      | 128/179 [00:46<00:17,  2.89it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.49it/s][A
- 73%|███████████████▎     | 130/179 [00:47<00:17,  2.72it/s][A
- 73%|███████████████▎     | 131/179 [00:47<00:16,  2.83it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.91it/s][A
- 74%|███████████████▌     | 133/179 [00:48<00:18,  2.50it/s][A
- 75%|███████████████▋     | 134/179 [00:48<00:16,  2.73it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.83it/s][A
- 76%|███████████████▉     | 136/179 [00:49<00:14,  2.91it/s][A
- 77%|████████████████     | 137/179 [00:50<00:16,  2.49it/s][A
- 77%|████████████████▏    | 138/179 [00:50<00:15,  2.71it/s][A
- 78%|████████████████▎    | 139/179 [00:50<00:14,  2.81it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|████████████████▌    | 141/179 [00:51<00:15,  2.47it/s][A
- 79%|████████████████▋    | 142/179 [00:51<00:13,  2.70it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.81it/s][A
- 80%|████████████████▉    | 144/179 [00:52<00:12,  2.88it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.48it/s][A
- 82%|█████████████████▏   | 146/179 [00:53<00:12,  2.71it/s][A
- 82%|█████████████████▏   | 147/179 [00:53<00:11,  2.83it/s][A
- 83%|█████████████████▎   | 148/179 [00:53<00:10,  2.90it/s][A
- 83%|█████████████████▍   | 149/179 [00:54<00:12,  2.50it/s][A
- 84%|█████████████████▌   | 150/179 [00:54<00:10,  2.72it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:09,  2.83it/s][A
- 85%|█████████████████▊   | 152/179 [00:55<00:09,  2.89it/s][A
- 85%|█████████████████▉   | 153/179 [00:55<00:10,  2.49it/s][A
- 86%|██████████████████   | 154/179 [00:56<00:09,  2.72it/s][A
- 87%|██████████████████▏  | 155/179 [00:56<00:08,  2.83it/s][A
- 87%|██████████████████▎  | 156/179 [00:56<00:07,  2.90it/s][A
- 88%|██████████████████▍  | 157/179 [00:57<00:08,  2.48it/s][A
- 88%|██████████████████▌  | 158/179 [00:57<00:07,  2.71it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.83it/s][A
- 89%|██████████████████▊  | 160/179 [00:58<00:06,  2.91it/s][A
- 90%|██████████████████▉  | 161/179 [00:58<00:07,  2.49it/s][A
- 91%|███████████████████  | 162/179 [00:59<00:06,  2.72it/s][A
- 91%|███████████████████  | 163/179 [00:59<00:05,  2.83it/s][A
- 92%|███████████████████▏ | 164/179 [00:59<00:05,  2.91it/s][A
- 92%|███████████████████▎ | 165/179 [01:00<00:05,  2.51it/s][A
- 93%|███████████████████▍ | 166/179 [01:00<00:04,  2.72it/s][A
- 93%|███████████████████▌ | 167/179 [01:00<00:04,  2.82it/s][A
- 94%|███████████████████▋ | 168/179 [01:01<00:03,  2.91it/s][A
- 94%|███████████████████▊ | 169/179 [01:01<00:04,  2.50it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.73it/s][A
- 96%|████████████████████ | 171/179 [01:02<00:02,  2.82it/s][A
- 96%|████████████████████▏| 172/179 [01:02<00:02,  2.90it/s][A
- 97%|████████████████████▎| 173/179 [01:03<00:02,  2.50it/s][A
- 97%|████████████████████▍| 174/179 [01:03<00:01,  2.72it/s][A
- 98%|████████████████████▌| 175/179 [01:03<00:01,  2.83it/s][A
- 98%|████████████████████▋| 176/179 [01:04<00:01,  2.89it/s][A
- 99%|████████████████████▊| 177/179 [01:04<00:00,  2.49it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.72it/s][A
-100%|█████████████████████| 179/179 [01:05<00:00,  2.62it/s][A                                                            
-                                                            [A{'eval_loss': 2.4961862564086914, 'eval_runtime': 67.6115, 'eval_samples_per_second': 2.899, 'eval_steps_per_second': 1.449, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.25}
- 25%|███▌          | 250/1000 [1:50:41<24:39:47, 118.38s/it]
-100%|█████████████████████| 179/179 [01:05<00:00,  2.62it/s][A
-                                                            [A[2025-10-18 20:53:28,396] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-250
- 25%|███▌          | 251/1000 [1:50:52<22:35:24, 108.58s/it]                                                            {'loss': 2.4839, 'grad_norm': 0.874302864074707, 'learning_rate': 0.00017567128158176953, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 718.64, 'epoch': 0.25}
- 25%|███▌          | 251/1000 [1:50:52<22:35:24, 108.58s/it] 25%|███▊           | 252/1000 [1:51:00<16:16:11, 78.30s/it]                                                            {'loss': 2.7131, 'grad_norm': 0.9815873503684998, 'learning_rate': 0.00017545915227472965, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 831.96, 'epoch': 0.25}
- 25%|███▊           | 252/1000 [1:51:00<16:16:11, 78.30s/it] 25%|███▊           | 253/1000 [1:51:07<11:50:54, 57.10s/it]                                                            {'loss': 2.4465, 'grad_norm': 0.799092173576355, 'learning_rate': 0.00017524623143688902, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 928.76, 'epoch': 0.25}
- 25%|███▊           | 253/1000 [1:51:07<11:50:54, 57.10s/it] 25%|████            | 254/1000 [1:51:15<8:45:38, 42.28s/it]                                                            {'loss': 2.4025, 'grad_norm': 0.8644756078720093, 'learning_rate': 0.00017503252130168657, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 703.64, 'epoch': 0.25}
- 25%|████            | 254/1000 [1:51:15<8:45:38, 42.28s/it] 26%|████            | 255/1000 [1:51:23<6:36:07, 31.90s/it]                                                            {'loss': 2.7525, 'grad_norm': 0.7470369338989258, 'learning_rate': 0.00017481802411084042, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1115.38, 'epoch': 0.26}
- 26%|████            | 255/1000 [1:51:23<6:36:07, 31.90s/it] 26%|████            | 256/1000 [1:51:30<5:05:25, 24.63s/it]                                                            {'loss': 2.4773, 'grad_norm': 0.7187418937683105, 'learning_rate': 0.0001746027421143246, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1015.39, 'epoch': 0.26}
- 26%|████            | 256/1000 [1:51:30<5:05:25, 24.63s/it] 26%|████            | 257/1000 [1:51:38<4:02:05, 19.55s/it]                                                            {'loss': 2.4191, 'grad_norm': 0.6954782009124756, 'learning_rate': 0.00017438667757034546, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1123.86, 'epoch': 0.26}
- 26%|████            | 257/1000 [1:51:38<4:02:05, 19.55s/it] 26%|████▏           | 258/1000 [1:51:46<3:17:42, 15.99s/it]                                                            {'loss': 2.4048, 'grad_norm': 0.8372341394424438, 'learning_rate': 0.00017416983274531775, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 817.6, 'epoch': 0.26}
- 26%|████▏           | 258/1000 [1:51:46<3:17:42, 15.99s/it] 26%|████▏           | 259/1000 [1:51:53<2:46:41, 13.50s/it]                                                            {'loss': 2.6038, 'grad_norm': 0.7202888131141663, 'learning_rate': 0.0001739522099138411, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1184.68, 'epoch': 0.26}
- 26%|████▏           | 259/1000 [1:51:53<2:46:41, 13.50s/it] 26%|████▏           | 260/1000 [1:52:01<2:25:01, 11.76s/it]                                                            {'loss': 2.6101, 'grad_norm': 0.7800538539886475, 'learning_rate': 0.00017373381135867604, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1041.24, 'epoch': 0.26}
- 26%|████▏           | 260/1000 [1:52:01<2:25:01, 11.76s/it] 26%|████▏           | 261/1000 [1:52:09<2:09:53, 10.55s/it]                                                            {'loss': 2.4472, 'grad_norm': 0.7340859174728394, 'learning_rate': 0.00017351463937072004, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 982.49, 'epoch': 0.26}
- 26%|████▏           | 261/1000 [1:52:09<2:09:53, 10.55s/it] 26%|████▏           | 262/1000 [1:52:17<1:59:11,  9.69s/it]                                                            {'loss': 2.7, 'grad_norm': 0.6798796653747559, 'learning_rate': 0.0001732946962489836, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1386.4, 'epoch': 0.26}
- 26%|████▏           | 262/1000 [1:52:17<1:59:11,  9.69s/it] 26%|████▏           | 263/1000 [1:52:24<1:51:43,  9.10s/it]                                                            {'loss': 2.5642, 'grad_norm': 0.6675375699996948, 'learning_rate': 0.00017307398430056593, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1204.06, 'epoch': 0.26}
- 26%|████▏           | 263/1000 [1:52:24<1:51:43,  9.10s/it] 26%|████▏           | 264/1000 [1:52:32<1:46:32,  8.69s/it]                                                            {'loss': 2.56, 'grad_norm': 0.7139641046524048, 'learning_rate': 0.000172852505840631, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1066.1, 'epoch': 0.26}
- 26%|████▏           | 264/1000 [1:52:32<1:46:32,  8.69s/it] 26%|████▏           | 265/1000 [1:52:40<1:42:53,  8.40s/it]                                                            {'loss': 2.8196, 'grad_norm': 0.6505297422409058, 'learning_rate': 0.00017263026319238301, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1429.89, 'epoch': 0.27}
- 26%|████▏           | 265/1000 [1:52:40<1:42:53,  8.40s/it] 27%|████▎           | 266/1000 [1:52:47<1:40:07,  8.18s/it]                                                            {'loss': 2.4539, 'grad_norm': 0.8675104975700378, 'learning_rate': 0.00017240725868704218, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 640.74, 'epoch': 0.27}
- 27%|████▎           | 266/1000 [1:52:47<1:40:07,  8.18s/it] 27%|████▎           | 267/1000 [1:52:55<1:38:12,  8.04s/it]                                                            {'loss': 2.7327, 'grad_norm': 0.8988902568817139, 'learning_rate': 0.00017218349466382023, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 680.6, 'epoch': 0.27}
- 27%|████▎           | 267/1000 [1:52:55<1:38:12,  8.04s/it] 27%|████▎           | 268/1000 [1:53:03<1:36:52,  7.94s/it]                                                            {'loss': 2.763, 'grad_norm': 0.7642865777015686, 'learning_rate': 0.0001719589734698959, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1097.58, 'epoch': 0.27}
- 27%|████▎           | 268/1000 [1:53:03<1:36:52,  7.94s/it] 27%|████▎           | 269/1000 [1:53:11<1:35:59,  7.88s/it]                                                            {'loss': 2.5095, 'grad_norm': 0.7225785255432129, 'learning_rate': 0.00017173369746039025, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1155.1, 'epoch': 0.27}
- 27%|████▎           | 269/1000 [1:53:11<1:35:59,  7.88s/it] 27%|████▎           | 270/1000 [1:53:18<1:35:17,  7.83s/it]                                                            {'loss': 2.6662, 'grad_norm': 0.8746216297149658, 'learning_rate': 0.00017150766899834204, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 822.42, 'epoch': 0.27}
- 27%|████▎           | 270/1000 [1:53:18<1:35:17,  7.83s/it] 27%|████▎           | 271/1000 [1:53:26<1:34:36,  7.79s/it]                                                            {'loss': 2.7887, 'grad_norm': 0.9155292510986328, 'learning_rate': 0.00017128089045468294, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 699.72, 'epoch': 0.27}
- 27%|████▎           | 271/1000 [1:53:26<1:34:36,  7.79s/it] 27%|████▎           | 272/1000 [1:53:34<1:34:16,  7.77s/it]                                                            {'loss': 2.6673, 'grad_norm': 0.6728147268295288, 'learning_rate': 0.00017105336420821247, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1271.15, 'epoch': 0.27}
- 27%|████▎           | 272/1000 [1:53:34<1:34:16,  7.77s/it] 27%|████▎           | 273/1000 [1:53:41<1:33:55,  7.75s/it]                                                            {'loss': 2.4617, 'grad_norm': 0.6885935664176941, 'learning_rate': 0.0001708250926455733, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1173.22, 'epoch': 0.27}
- 27%|████▎           | 273/1000 [1:53:41<1:33:55,  7.75s/it] 27%|████▍           | 274/1000 [1:53:49<1:33:38,  7.74s/it]                                                            {'loss': 2.6354, 'grad_norm': 0.7414568662643433, 'learning_rate': 0.00017059607816122618, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1022.1, 'epoch': 0.27}
- 27%|████▍           | 274/1000 [1:53:49<1:33:38,  7.74s/it] 28%|████▍           | 275/1000 [1:53:57<1:33:22,  7.73s/it]                                                            {'loss': 2.6424, 'grad_norm': 0.7236939072608948, 'learning_rate': 0.00017036632315742462, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1078.36, 'epoch': 0.28}
- 28%|████▍           | 275/1000 [1:53:57<1:33:22,  7.73s/it] 28%|████▍           | 276/1000 [1:54:05<1:33:10,  7.72s/it]                                                            {'loss': 2.6886, 'grad_norm': 0.8048384785652161, 'learning_rate': 0.00017013583004418993, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 907.05, 'epoch': 0.28}
- 28%|████▍           | 276/1000 [1:54:05<1:33:10,  7.72s/it] 28%|████▍           | 277/1000 [1:54:12<1:33:03,  7.72s/it]                                                            {'loss': 2.4551, 'grad_norm': 0.9392925500869751, 'learning_rate': 0.00016990460123928575, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 635.68, 'epoch': 0.28}
- 28%|████▍           | 277/1000 [1:54:12<1:33:03,  7.72s/it] 28%|████▍           | 278/1000 [1:54:20<1:32:55,  7.72s/it]                                                            {'loss': 2.43, 'grad_norm': 0.7178683280944824, 'learning_rate': 0.00016967263916819287, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1187.58, 'epoch': 0.28}
- 28%|████▍           | 278/1000 [1:54:20<1:32:55,  7.72s/it] 28%|████▍           | 279/1000 [1:54:28<1:32:53,  7.73s/it]                                                            {'loss': 2.484, 'grad_norm': 0.69338059425354, 'learning_rate': 0.00016943994626408363, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1233.55, 'epoch': 0.28}
- 28%|████▍           | 279/1000 [1:54:28<1:32:53,  7.73s/it] 28%|████▍           | 280/1000 [1:54:35<1:32:46,  7.73s/it]                                                            {'loss': 2.7441, 'grad_norm': 0.7821597456932068, 'learning_rate': 0.0001692065249677965, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1106.84, 'epoch': 0.28}
- 28%|████▍           | 280/1000 [1:54:35<1:32:46,  7.73s/it] 28%|████▍           | 281/1000 [1:54:43<1:32:36,  7.73s/it]                                                            {'loss': 2.4182, 'grad_norm': 0.8302145600318909, 'learning_rate': 0.00016897237772781044, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 868.62, 'epoch': 0.28}
- 28%|████▍           | 281/1000 [1:54:43<1:32:36,  7.73s/it] 28%|████▌           | 282/1000 [1:54:51<1:32:22,  7.72s/it]                                                            {'loss': 2.669, 'grad_norm': 0.9646657109260559, 'learning_rate': 0.00016873750700021915, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 608.35, 'epoch': 0.28}
- 28%|████▌           | 282/1000 [1:54:51<1:32:22,  7.72s/it] 28%|████▌           | 283/1000 [1:54:59<1:32:13,  7.72s/it]                                                            {'loss': 2.559, 'grad_norm': 0.8006969094276428, 'learning_rate': 0.00016850191524870546, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 856.45, 'epoch': 0.28}
- 28%|████▌           | 283/1000 [1:54:59<1:32:13,  7.72s/it] 28%|████▌           | 284/1000 [1:55:06<1:32:02,  7.71s/it]                                                            {'loss': 2.5713, 'grad_norm': 0.7698496580123901, 'learning_rate': 0.00016826560494451537, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1000.91, 'epoch': 0.28}
- 28%|████▌           | 284/1000 [1:55:06<1:32:02,  7.71s/it] 28%|████▌           | 285/1000 [1:55:14<1:31:54,  7.71s/it]                                                            {'loss': 2.5755, 'grad_norm': 0.7082530856132507, 'learning_rate': 0.00016802857856643215, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1179.94, 'epoch': 0.28}
- 28%|████▌           | 285/1000 [1:55:14<1:31:54,  7.71s/it] 29%|████▌           | 286/1000 [1:55:22<1:31:47,  7.71s/it]                                                            {'loss': 2.4492, 'grad_norm': 0.6993674039840698, 'learning_rate': 0.00016779083860075033, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1135.1, 'epoch': 0.29}
- 29%|████▌           | 286/1000 [1:55:22<1:31:47,  7.71s/it] 29%|████▌           | 287/1000 [1:55:29<1:31:34,  7.71s/it]                                                            {'loss': 2.7165, 'grad_norm': 0.7869941592216492, 'learning_rate': 0.00016755238754124965, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1003.54, 'epoch': 0.29}
- 29%|████▌           | 287/1000 [1:55:29<1:31:34,  7.71s/it] 29%|████▌           | 288/1000 [1:55:37<1:31:22,  7.70s/it]                                                            {'loss': 2.5965, 'grad_norm': 0.7975780367851257, 'learning_rate': 0.00016731322788916892, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 884.6, 'epoch': 0.29}
- 29%|████▌           | 288/1000 [1:55:37<1:31:22,  7.70s/it] 29%|████▌           | 289/1000 [1:55:45<1:31:12,  7.70s/it]                                                            {'loss': 2.5957, 'grad_norm': 0.8249726891517639, 'learning_rate': 0.00016707336215317968, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 811.3, 'epoch': 0.29}
- 29%|████▌           | 289/1000 [1:55:45<1:31:12,  7.70s/it] 29%|████▋           | 290/1000 [1:55:53<1:31:14,  7.71s/it]                                                            {'loss': 2.6608, 'grad_norm': 0.758572518825531, 'learning_rate': 0.00016683279284936004, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 880.29, 'epoch': 0.29}
- 29%|████▋           | 290/1000 [1:55:53<1:31:14,  7.71s/it] 29%|████▋           | 291/1000 [1:56:00<1:31:05,  7.71s/it]                                                            {'loss': 2.5593, 'grad_norm': 0.7538958191871643, 'learning_rate': 0.00016659152250116812, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 988.65, 'epoch': 0.29}
- 29%|████▋           | 291/1000 [1:56:00<1:31:05,  7.71s/it] 29%|████▋           | 292/1000 [1:56:08<1:30:49,  7.70s/it]                                                            {'loss': 2.5155, 'grad_norm': 0.874915361404419, 'learning_rate': 0.00016634955363941574, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 729.33, 'epoch': 0.29}
- 29%|████▋           | 292/1000 [1:56:08<1:30:49,  7.70s/it] 29%|████▋           | 293/1000 [1:56:16<1:30:41,  7.70s/it]                                                            {'loss': 2.5294, 'grad_norm': 0.7377837896347046, 'learning_rate': 0.00016610688880224178, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 989.14, 'epoch': 0.29}
- 29%|████▋           | 293/1000 [1:56:16<1:30:41,  7.70s/it] 29%|████▋           | 294/1000 [1:56:23<1:30:28,  7.69s/it]                                                            {'loss': 2.6486, 'grad_norm': 0.9139818549156189, 'learning_rate': 0.0001658635305350855, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 702.12, 'epoch': 0.29}
- 29%|████▋           | 294/1000 [1:56:23<1:30:28,  7.69s/it] 30%|████▋           | 295/1000 [1:56:31<1:30:25,  7.70s/it]                                                            {'loss': 2.4037, 'grad_norm': 0.7837443947792053, 'learning_rate': 0.00016561948139065996, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 888.88, 'epoch': 0.29}
- 30%|████▋           | 295/1000 [1:56:31<1:30:25,  7.70s/it] 30%|████▋           | 296/1000 [1:56:39<1:30:19,  7.70s/it]                                                            {'loss': 2.5454, 'grad_norm': 0.7109882831573486, 'learning_rate': 0.00016537474392892528, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1115.75, 'epoch': 0.3}
- 30%|████▋           | 296/1000 [1:56:39<1:30:19,  7.70s/it] 30%|████▊           | 297/1000 [1:56:46<1:30:12,  7.70s/it]                                                            {'loss': 2.5067, 'grad_norm': 0.8539952039718628, 'learning_rate': 0.00016512932071706152, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 809.83, 'epoch': 0.3}
- 30%|████▊           | 297/1000 [1:56:46<1:30:12,  7.70s/it] 30%|████▊           | 298/1000 [1:56:54<1:30:02,  7.70s/it]                                                            {'loss': 2.6339, 'grad_norm': 0.8438757061958313, 'learning_rate': 0.0001648832143294422, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 906.84, 'epoch': 0.3}
- 30%|████▊           | 298/1000 [1:56:54<1:30:02,  7.70s/it] 30%|████▊           | 299/1000 [1:57:02<1:30:04,  7.71s/it]                                                            {'loss': 2.5535, 'grad_norm': 0.731607973575592, 'learning_rate': 0.0001646364273476067, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1016.47, 'epoch': 0.3}
- 30%|████▊           | 299/1000 [1:57:02<1:30:04,  7.71s/it] 30%|████▊           | 300/1000 [1:57:10<1:29:54,  7.71s/it]                                                            {'loss': 2.6206, 'grad_norm': 0.7364538908004761, 'learning_rate': 0.00016438896236023375, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1087.19, 'epoch': 0.3}
- 30%|████▊           | 300/1000 [1:57:10<1:29:54,  7.71s/it][2025-10-18 20:59:56,665] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 20:59:59,713] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4530835151672363
-[2025-10-18 21:00:01,168] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4553418159484863
-[2025-10-18 21:00:02,606] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4376530647277832
-[2025-10-18 21:00:04,035] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4281249046325684
-[2025-10-18 21:00:04,035] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|���                      | 2/179 [00:00<00:28,  6.18it/s][A
-  2%|▍                      | 3/179 [00:00<00:41,  4.20it/s][A
-  2%|▌                      | 4/179 [00:00<00:47,  3.70it/s][A
-  3%|▋                      | 5/179 [00:01<01:20,  2.17it/s][A
-  3%|▊                      | 6/179 [00:02<01:10,  2.47it/s][A
-  4%|▉                      | 7/179 [00:02<01:05,  2.64it/s][A
-  4%|█                      | 8/179 [00:02<01:01,  2.76it/s][A
-  5%|█▏                     | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▏                    | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▎                    | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▍                    | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▌                    | 13/179 [00:04<01:07,  2.47it/s][A
-  8%|█▋                    | 14/179 [00:05<01:01,  2.70it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██                    | 17/179 [00:06<01:05,  2.47it/s][A
- 10%|██▏                   | 18/179 [00:06<00:59,  2.70it/s][A
- 11%|██▎                   | 19/179 [00:06<00:56,  2.82it/s][A
- 11%|██▍                   | 20/179 [00:07<00:54,  2.90it/s][A
- 12%|██▌                   | 21/179 [00:07<01:03,  2.48it/s][A
- 12%|██▋                   | 22/179 [00:08<00:58,  2.71it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.81it/s][A
- 13%|██▉                   | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███                   | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▎                  | 27/179 [00:09<00:54,  2.78it/s][A
- 16%|███▍                  | 28/179 [00:10<00:52,  2.86it/s][A
- 16%|███▌                  | 29/179 [00:10<01:00,  2.46it/s][A
- 17%|███▋                  | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|███▊                  | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|███▉                  | 32/179 [00:11<00:51,  2.87it/s][A
- 18%|████                  | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▏                 | 34/179 [00:12<00:53,  2.69it/s][A
- 20%|████▎                 | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|████▍                 | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|████▌                 | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|████▋                 | 38/179 [00:13<00:52,  2.71it/s][A
- 22%|████▊                 | 39/179 [00:14<00:49,  2.82it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.89it/s][A
- 23%|█████                 | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.80it/s][A
- 25%|█████▍                | 44/179 [00:16<00:47,  2.86it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.46it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|█████▊                | 47/179 [00:17<00:47,  2.81it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.86it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.46it/s][A
- 28%|██████▏               | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.79it/s][A
- 29%|██████▍               | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|██████▌               | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.88it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.48it/s][A
- 32%|███████▏              | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.81it/s][A
- 34%|███████▎              | 60/179 [00:21<00:41,  2.88it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.48it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.70it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.81it/s][A
- 36%|███████▊              | 64/179 [00:23<00:39,  2.89it/s][A
- 36%|███████▉              | 65/179 [00:23<00:45,  2.48it/s][A
- 37%|████████              | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|████████▏             | 67/179 [00:24<00:39,  2.81it/s][A
- 38%|████████▎             | 68/179 [00:24<00:38,  2.88it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.81it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|████████▉             | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|█████████             | 74/179 [00:27<00:38,  2.70it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.81it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:41,  2.48it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.70it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.80it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|██████████            | 82/179 [00:30<00:36,  2.69it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:41,  2.26it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:36,  2.51it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:34,  2.66it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:32,  2.78it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:37,  2.43it/s][A
- 50%|███████████           | 90/179 [00:33<00:33,  2.66it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.75it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.82it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:35,  2.43it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.67it/s][A
- 53%|███████████▋          | 95/179 [00:35<00:30,  2.78it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:29,  2.86it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.46it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.69it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.79it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.86it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.47it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.70it/s][A
- 58%|████████████         | 103/179 [00:38<00:27,  2.79it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|████████████▌        | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.88it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:29,  2.37it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:26,  2.64it/s][A
- 62%|█████████████        | 111/179 [00:41<00:24,  2.76it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.85it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:22,  2.81it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:21,  2.86it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.45it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|█████████████▉       | 119/179 [00:44<00:21,  2.79it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.48it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.70it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:20,  2.80it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.68it/s][A
- 71%|██████████████▉      | 127/179 [00:47<00:18,  2.80it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.87it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.47it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.69it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.81it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|███████████████▋     | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|███████████████▉     | 136/179 [00:50<00:15,  2.84it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.69it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.79it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.86it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.45it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.80it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.87it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.46it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.68it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.69it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|██████████████████   | 154/179 [00:57<00:09,  2.69it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:08,  2.86it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|███████████████████  | 162/179 [01:00<00:06,  2.70it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.80it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.85it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.44it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.67it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|███████████████████▉ | 170/179 [01:03<00:03,  2.68it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.78it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.86it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.45it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.79it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.69it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.53it/s][A                                                            
-                                                            [A{'eval_loss': 2.467153549194336, 'eval_runtime': 68.7205, 'eval_samples_per_second': 2.852, 'eval_steps_per_second': 1.426, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.3}
- 30%|████▊           | 300/1000 [1:58:26<1:29:54,  7.71s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.53it/s][A
-                                                            [A[2025-10-18 21:01:12,763] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-300
- 30%|████▊           | 301/1000 [1:58:36<6:05:35, 31.38s/it]                                                            {'loss': 2.4363, 'grad_norm': 0.9879778027534485, 'learning_rate': 0.000164140821963114, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 567.49, 'epoch': 0.3}
- 30%|████▊           | 301/1000 [1:58:36<6:05:35, 31.38s/it] 30%|████▊           | 302/1000 [1:58:44<4:42:13, 24.26s/it]                                                            {'loss': 2.6001, 'grad_norm': 0.839836597442627, 'learning_rate': 0.00016389200875912278, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 803.99, 'epoch': 0.3}
- 30%|████▊           | 302/1000 [1:58:44<4:42:13, 24.26s/it] 30%|████▊           | 303/1000 [1:58:51<3:44:03, 19.29s/it]                                                            {'loss': 2.5892, 'grad_norm': 0.6544349789619446, 'learning_rate': 0.00016364252535819282, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1191.54, 'epoch': 0.3}
- 30%|████▊           | 303/1000 [1:58:52<3:44:03, 19.29s/it] 30%|████▊           | 304/1000 [1:58:59<3:03:25, 15.81s/it]                                                            {'loss': 2.6063, 'grad_norm': 0.8167876601219177, 'learning_rate': 0.000163392374377287, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 922.14, 'epoch': 0.3}
- 30%|████▊           | 304/1000 [1:58:59<3:03:25, 15.81s/it] 30%|████▉           | 305/1000 [1:59:07<2:34:55, 13.38s/it]                                                            {'loss': 2.3822, 'grad_norm': 0.7517671585083008, 'learning_rate': 0.00016314155844037074, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 932.26, 'epoch': 0.3}
- 30%|████▉           | 305/1000 [1:59:07<2:34:55, 13.38s/it] 31%|████▉           | 306/1000 [1:59:15<2:14:54, 11.66s/it]                                                            {'loss': 2.7333, 'grad_norm': 0.7878764271736145, 'learning_rate': 0.00016289008017838445, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 931.28, 'epoch': 0.31}
- 31%|████▉           | 306/1000 [1:59:15<2:14:54, 11.66s/it] 31%|████▉           | 307/1000 [1:59:22<2:00:53, 10.47s/it]                                                            {'loss': 2.3509, 'grad_norm': 0.8184928297996521, 'learning_rate': 0.0001626379422292162, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 822.3, 'epoch': 0.31}
- 31%|████▉           | 307/1000 [1:59:22<2:00:53, 10.47s/it] 31%|████▉           | 308/1000 [1:59:30<1:51:04,  9.63s/it]                                                            {'loss': 2.8412, 'grad_norm': 0.9964088797569275, 'learning_rate': 0.00016238514723767374, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 686.61, 'epoch': 0.31}
- 31%|████▉           | 308/1000 [1:59:30<1:51:04,  9.63s/it] 31%|████▉           | 309/1000 [1:59:38<1:44:10,  9.05s/it]                                                            {'loss': 2.5192, 'grad_norm': 0.743407666683197, 'learning_rate': 0.0001621316978554569, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1045.3, 'epoch': 0.31}
- 31%|████▉           | 309/1000 [1:59:38<1:44:10,  9.05s/it] 31%|████▉           | 310/1000 [1:59:45<1:39:34,  8.66s/it]                                                            {'loss': 2.7339, 'grad_norm': 0.7762504816055298, 'learning_rate': 0.00016187759674112973, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1170.03, 'epoch': 0.31}
- 31%|████▉           | 310/1000 [1:59:45<1:39:34,  8.66s/it] 31%|████▉           | 311/1000 [1:59:53<1:36:03,  8.37s/it]                                                            {'loss': 2.3895, 'grad_norm': 1.1820642948150635, 'learning_rate': 0.00016162284656009274, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 345.51, 'epoch': 0.31}
- 31%|████▉           | 311/1000 [1:59:53<1:36:03,  8.37s/it] 31%|████▉           | 312/1000 [2:00:01<1:33:33,  8.16s/it]                                                            {'loss': 2.4391, 'grad_norm': 0.7680884599685669, 'learning_rate': 0.00016136744998455476, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 877.96, 'epoch': 0.31}
- 31%|████▉           | 312/1000 [2:00:01<1:33:33,  8.16s/it] 31%|█████           | 313/1000 [2:00:08<1:31:48,  8.02s/it]                                                            {'loss': 2.5209, 'grad_norm': 1.0926076173782349, 'learning_rate': 0.00016111140969350503, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 515.52, 'epoch': 0.31}
- 31%|█████           | 313/1000 [2:00:08<1:31:48,  8.02s/it] 31%|█████           | 314/1000 [2:00:16<1:30:35,  7.92s/it]                                                            {'loss': 2.392, 'grad_norm': 0.7927546501159668, 'learning_rate': 0.00016085472837268502, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 825.13, 'epoch': 0.31}
- 31%|█████           | 314/1000 [2:00:16<1:30:35,  7.92s/it] 32%|█████           | 315/1000 [2:00:24<1:29:41,  7.86s/it]                                                            {'loss': 2.4962, 'grad_norm': 0.8119118213653564, 'learning_rate': 0.00016059740871456036, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 850.79, 'epoch': 0.32}
- 32%|█████           | 315/1000 [2:00:24<1:29:41,  7.86s/it] 32%|█████           | 316/1000 [2:00:32<1:29:06,  7.82s/it]                                                            {'loss': 2.4298, 'grad_norm': 0.7645765542984009, 'learning_rate': 0.00016033945341829248, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 996.05, 'epoch': 0.32}
- 32%|█████           | 316/1000 [2:00:32<1:29:06,  7.82s/it] 32%|█████           | 317/1000 [2:00:39<1:28:35,  7.78s/it]                                                            {'loss': 2.4209, 'grad_norm': 0.8331952691078186, 'learning_rate': 0.00016008086518971037, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 795.5, 'epoch': 0.32}
- 32%|█████           | 317/1000 [2:00:39<1:28:35,  7.78s/it] 32%|█████           | 318/1000 [2:00:47<1:28:08,  7.75s/it]                                                            {'loss': 2.6023, 'grad_norm': 0.9243016839027405, 'learning_rate': 0.0001598216467412822, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 773.18, 'epoch': 0.32}
- 32%|█████           | 318/1000 [2:00:47<1:28:08,  7.75s/it] 32%|█████           | 319/1000 [2:00:55<1:27:53,  7.74s/it]                                                            {'loss': 2.2524, 'grad_norm': 0.8096899390220642, 'learning_rate': 0.00015956180079208682, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 823.88, 'epoch': 0.32}
- 32%|█████           | 319/1000 [2:00:55<1:27:53,  7.74s/it] 32%|█████           | 320/1000 [2:01:02<1:27:41,  7.74s/it]                                                            {'loss': 2.597, 'grad_norm': 0.799497127532959, 'learning_rate': 0.0001593013300677853, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 883.85, 'epoch': 0.32}
- 32%|█████           | 320/1000 [2:01:02<1:27:41,  7.74s/it] 32%|█████▏          | 321/1000 [2:01:10<1:27:30,  7.73s/it]                                                            {'loss': 2.8253, 'grad_norm': 0.7999076843261719, 'learning_rate': 0.00015904023730059228, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 942.39, 'epoch': 0.32}
- 32%|█████▏          | 321/1000 [2:01:10<1:27:30,  7.73s/it] 32%|█████▏          | 322/1000 [2:01:18<1:27:20,  7.73s/it]                                                            {'loss': 2.5088, 'grad_norm': 0.7991549372673035, 'learning_rate': 0.00015877852522924732, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 871.95, 'epoch': 0.32}
- 32%|█████▏          | 322/1000 [2:01:18<1:27:20,  7.73s/it] 32%|█████▏          | 323/1000 [2:01:26<1:27:11,  7.73s/it]                                                            {'loss': 2.4771, 'grad_norm': 0.8044911026954651, 'learning_rate': 0.00015851619659898623, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 781.86, 'epoch': 0.32}
- 32%|█████▏          | 323/1000 [2:01:26<1:27:11,  7.73s/it] 32%|█████▏          | 324/1000 [2:01:33<1:26:58,  7.72s/it]                                                            {'loss': 2.6089, 'grad_norm': 0.84713214635849, 'learning_rate': 0.00015825325416151222, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 828.68, 'epoch': 0.32}
- 32%|█████▏          | 324/1000 [2:01:33<1:26:58,  7.72s/it] 32%|█████▏          | 325/1000 [2:01:41<1:26:53,  7.72s/it]                                                            {'loss': 2.3837, 'grad_norm': 0.6131676435470581, 'learning_rate': 0.000157989700674967, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1479.12, 'epoch': 0.33}
- 32%|█████▏          | 325/1000 [2:01:41<1:26:53,  7.72s/it] 33%|█████▏          | 326/1000 [2:01:49<1:26:40,  7.72s/it]                                                            {'loss': 2.6683, 'grad_norm': 0.8171447515487671, 'learning_rate': 0.00015772553890390197, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 880.97, 'epoch': 0.33}
- 33%|█████▏          | 326/1000 [2:01:49<1:26:40,  7.72s/it] 33%|█████▏          | 327/1000 [2:01:56<1:26:25,  7.71s/it]                                                            {'loss': 2.659, 'grad_norm': 0.8209149241447449, 'learning_rate': 0.00015746077161924905, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 962.75, 'epoch': 0.33}
- 33%|█████▏          | 327/1000 [2:01:56<1:26:25,  7.71s/it] 33%|█████▏          | 328/1000 [2:02:04<1:26:14,  7.70s/it]                                                            {'loss': 2.437, 'grad_norm': 0.7156986594200134, 'learning_rate': 0.00015719540159829184, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1032.51, 'epoch': 0.33}
- 33%|█████▏          | 328/1000 [2:02:04<1:26:14,  7.70s/it] 33%|█████▎          | 329/1000 [2:02:12<1:26:07,  7.70s/it]                                                            {'loss': 2.3491, 'grad_norm': 0.9886549711227417, 'learning_rate': 0.00015692943162463628, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 595.05, 'epoch': 0.33}
- 33%|█████▎          | 329/1000 [2:02:12<1:26:07,  7.70s/it] 33%|█████▎          | 330/1000 [2:02:19<1:25:56,  7.70s/it]                                                            {'loss': 2.4946, 'grad_norm': 1.0928308963775635, 'learning_rate': 0.0001566628644881815, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 462.16, 'epoch': 0.33}
- 33%|█████▎          | 330/1000 [2:02:19<1:25:56,  7.70s/it] 33%|█████▎          | 331/1000 [2:02:27<1:25:48,  7.70s/it]                                                            {'loss': 2.4526, 'grad_norm': 0.9105722308158875, 'learning_rate': 0.00015639570298509064, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 697.72, 'epoch': 0.33}
- 33%|█████▎          | 331/1000 [2:02:27<1:25:48,  7.70s/it] 33%|█████▎          | 332/1000 [2:02:35<1:25:48,  7.71s/it]                                                            {'loss': 2.3692, 'grad_norm': 0.7395821213722229, 'learning_rate': 0.00015612794991776147, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1154.98, 'epoch': 0.33}
- 33%|█████▎          | 332/1000 [2:02:35<1:25:48,  7.71s/it] 33%|█████▎          | 333/1000 [2:02:43<1:25:40,  7.71s/it]                                                            {'loss': 2.5842, 'grad_norm': 0.7496511340141296, 'learning_rate': 0.00015585960809479696, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1178.4, 'epoch': 0.33}
- 33%|█████▎          | 333/1000 [2:02:43<1:25:40,  7.71s/it] 33%|█████▎          | 334/1000 [2:02:50<1:25:31,  7.71s/it]                                                            {'loss': 2.4394, 'grad_norm': 0.7105236053466797, 'learning_rate': 0.00015559068033097582, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1073.84, 'epoch': 0.33}
- 33%|█████▎          | 334/1000 [2:02:50<1:25:31,  7.71s/it] 34%|█████▎          | 335/1000 [2:02:58<1:25:22,  7.70s/it]                                                            {'loss': 2.754, 'grad_norm': 0.7707606554031372, 'learning_rate': 0.00015532116944722308, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 990.66, 'epoch': 0.34}
- 34%|█████▎          | 335/1000 [2:02:58<1:25:22,  7.70s/it] 34%|█████▍          | 336/1000 [2:03:06<1:25:07,  7.69s/it]                                                            {'loss': 2.7455, 'grad_norm': 0.820377767086029, 'learning_rate': 0.00015505107827058036, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 843.72, 'epoch': 0.34}
- 34%|█████▍          | 336/1000 [2:03:06<1:25:07,  7.69s/it] 34%|█████▍          | 337/1000 [2:03:13<1:25:00,  7.69s/it]                                                            {'loss': 2.5578, 'grad_norm': 0.753650963306427, 'learning_rate': 0.0001547804096341763, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 935.51, 'epoch': 0.34}
- 34%|█████▍          | 337/1000 [2:03:13<1:25:00,  7.69s/it] 34%|█████▍          | 338/1000 [2:03:21<1:24:52,  7.69s/it]                                                            {'loss': 2.5706, 'grad_norm': 0.8526700139045715, 'learning_rate': 0.00015450916637719684, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 777.68, 'epoch': 0.34}
- 34%|█████▍          | 338/1000 [2:03:21<1:24:52,  7.69s/it] 34%|█████▍          | 339/1000 [2:03:29<1:24:46,  7.69s/it]                                                            {'loss': 2.4742, 'grad_norm': 0.7942416667938232, 'learning_rate': 0.00015423735134485536, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 874.18, 'epoch': 0.34}
- 34%|█████▍          | 339/1000 [2:03:29<1:24:46,  7.69s/it] 34%|█████▍          | 340/1000 [2:03:36<1:24:39,  7.70s/it]                                                            {'loss': 2.5067, 'grad_norm': 0.8205062747001648, 'learning_rate': 0.00015396496738836292, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 890.77, 'epoch': 0.34}
- 34%|█████▍          | 340/1000 [2:03:36<1:24:39,  7.70s/it] 34%|█████▍          | 341/1000 [2:03:44<1:24:34,  7.70s/it]                                                            {'loss': 2.5191, 'grad_norm': 0.8866893649101257, 'learning_rate': 0.0001536920173648984, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 871.06, 'epoch': 0.34}
- 34%|█████▍          | 341/1000 [2:03:44<1:24:34,  7.70s/it] 34%|█████▍          | 342/1000 [2:03:52<1:24:26,  7.70s/it]                                                            {'loss': 2.4576, 'grad_norm': 0.8298866748809814, 'learning_rate': 0.0001534185041375783, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 902.04, 'epoch': 0.34}
- 34%|█████▍          | 342/1000 [2:03:52<1:24:26,  7.70s/it] 34%|█████▍          | 343/1000 [2:04:00<1:24:24,  7.71s/it]                                                            {'loss': 2.2094, 'grad_norm': 1.0336577892303467, 'learning_rate': 0.00015314443057542703, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 570.91, 'epoch': 0.34}
- 34%|█████▍          | 343/1000 [2:04:00<1:24:24,  7.71s/it] 34%|█████▌          | 344/1000 [2:04:07<1:24:16,  7.71s/it]                                                            {'loss': 2.4435, 'grad_norm': 0.8124457597732544, 'learning_rate': 0.00015286979955334652, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1103.97, 'epoch': 0.34}
- 34%|█████▌          | 344/1000 [2:04:07<1:24:16,  7.71s/it] 34%|█████▌          | 345/1000 [2:04:15<1:24:03,  7.70s/it]                                                            {'loss': 2.6118, 'grad_norm': 0.937639594078064, 'learning_rate': 0.00015259461395208628, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 686.55, 'epoch': 0.34}
- 34%|█████▌          | 345/1000 [2:04:15<1:24:03,  7.70s/it] 35%|█████▌          | 346/1000 [2:04:23<1:23:53,  7.70s/it]                                                            {'loss': 2.3132, 'grad_norm': 0.7303231954574585, 'learning_rate': 0.000152318876658213, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1040.55, 'epoch': 0.35}
- 35%|█████▌          | 346/1000 [2:04:23<1:23:53,  7.70s/it] 35%|█████▌          | 347/1000 [2:04:30<1:23:51,  7.71s/it]                                                            {'loss': 2.4619, 'grad_norm': 0.7620224356651306, 'learning_rate': 0.00015204259056408046, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1053.43, 'epoch': 0.35}
- 35%|█████▌          | 347/1000 [2:04:30<1:23:51,  7.71s/it] 35%|█████▌          | 348/1000 [2:04:38<1:23:42,  7.70s/it]                                                            {'loss': 2.5825, 'grad_norm': 0.7714886665344238, 'learning_rate': 0.00015176575856779904, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 990.07, 'epoch': 0.35}
- 35%|█████▌          | 348/1000 [2:04:38<1:23:42,  7.70s/it] 35%|█████▌          | 349/1000 [2:04:46<1:23:31,  7.70s/it]                                                            {'loss': 2.5803, 'grad_norm': 0.838821291923523, 'learning_rate': 0.00015148838357320537, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 804.87, 'epoch': 0.35}
- 35%|█████▌          | 349/1000 [2:04:46<1:23:31,  7.70s/it] 35%|█████▌          | 350/1000 [2:04:53<1:23:23,  7.70s/it]                                                            {'loss': 2.4492, 'grad_norm': 0.9096402525901794, 'learning_rate': 0.0001512104684898319, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 693.98, 'epoch': 0.35}
- 35%|█████▌          | 350/1000 [2:04:53<1:23:23,  7.70s/it][2025-10-18 21:07:40,542] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 21:07:43,526] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4226689338684082
-[2025-10-18 21:07:44,916] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3892645835876465
-[2025-10-18 21:07:46,348] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4316890239715576
-[2025-10-18 21:07:47,754] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4046204090118408
-[2025-10-18 21:07:47,754] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:28,  6.12it/s][A
-  2%|▍                      | 3/179 [00:00<00:40,  4.39it/s][A
-  2%|▌                      | 4/179 [00:00<00:46,  3.79it/s][A
-  3%|▋                      | 5/179 [00:01<01:18,  2.22it/s][A
-  3%|▊                      | 6/179 [00:02<01:09,  2.51it/s][A
-  4%|▉                      | 7/179 [00:02<01:04,  2.67it/s][A
-  4%|█                      | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▏                     | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▏                    | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▎                    | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▍                    | 12/179 [00:04<00:58,  2.85it/s][A
-  7%|█▌                    | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▋                    | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██                    | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▏                   | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▎                   | 19/179 [00:06<00:57,  2.80it/s][A
- 11%|██▍                   | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▌                   | 21/179 [00:07<01:03,  2.47it/s][A
- 12%|██▋                   | 22/179 [00:07<00:58,  2.68it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.79it/s][A
- 13%|██▉                   | 24/179 [00:08<00:54,  2.86it/s][A
- 14%|███                   | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▎                  | 27/179 [00:09<00:54,  2.80it/s][A
- 16%|███▍                  | 28/179 [00:10<00:52,  2.88it/s][A
- 16%|███▌                  | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|███▋                  | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|███▊                  | 31/179 [00:11<00:52,  2.79it/s][A
- 18%|███▉                  | 32/179 [00:11<00:51,  2.87it/s][A
- 18%|████                  | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▏                 | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▎                 | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|████▍                 | 36/179 [00:13<00:49,  2.86it/s][A
- 21%|████▌                 | 37/179 [00:13<00:57,  2.46it/s][A
- 21%|████▋                 | 38/179 [00:13<00:52,  2.69it/s][A
- 22%|████▊                 | 39/179 [00:14<00:50,  2.80it/s][A
- 22%|████▉                 | 40/179 [00:14<00:48,  2.88it/s][A
- 23%|█████                 | 41/179 [00:15<00:56,  2.46it/s][A
- 23%|█████▏                | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|█████▎                | 43/179 [00:15<00:48,  2.79it/s][A
- 25%|█████▍                | 44/179 [00:16<00:47,  2.87it/s][A
- 25%|█████▌                | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|█████▋                | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|█████▊                | 47/179 [00:17<00:47,  2.79it/s][A
- 27%|█████▉                | 48/179 [00:17<00:45,  2.87it/s][A
- 27%|██████                | 49/179 [00:18<00:52,  2.47it/s][A
- 28%|██████▏               | 50/179 [00:18<00:48,  2.69it/s][A
- 28%|██████▎               | 51/179 [00:18<00:46,  2.77it/s][A
- 29%|██████▍               | 52/179 [00:19<00:44,  2.86it/s][A
- 30%|██████▌               | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|██████▋               | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|██████▊               | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████               | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|███████▏              | 58/179 [00:21<00:45,  2.68it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.80it/s][A
- 34%|███████▎              | 60/179 [00:22<00:41,  2.88it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|███████▌              | 62/179 [00:22<00:43,  2.70it/s][A
- 35%|███████▋              | 63/179 [00:23<00:41,  2.81it/s][A
- 36%|███████▊              | 64/179 [00:23<00:39,  2.88it/s][A
- 36%|███████▉              | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|████████              | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|████████▏             | 67/179 [00:24<00:39,  2.80it/s][A
- 38%|████████▎             | 68/179 [00:24<00:38,  2.89it/s][A
- 39%|████████▍             | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|████████▌             | 70/179 [00:25<00:40,  2.70it/s][A
- 40%|████████▋             | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|████████▊             | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|████████▉             | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|█████████             | 74/179 [00:27<00:38,  2.70it/s][A
- 42%|█████████▏            | 75/179 [00:27<00:37,  2.81it/s][A
- 42%|█████████▎            | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|█████████▍            | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|█████████▌            | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|█████████▋            | 79/179 [00:29<00:35,  2.80it/s][A
- 45%|█████████▊            | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|█████████▉            | 81/179 [00:29<00:39,  2.46it/s][A
- 46%|██████████            | 82/179 [00:30<00:36,  2.68it/s][A
- 46%|██████████▏           | 83/179 [00:30<00:34,  2.78it/s][A
- 47%|██████████▎           | 84/179 [00:30<00:33,  2.86it/s][A
- 47%|██████████▍           | 85/179 [00:31<00:38,  2.45it/s][A
- 48%|██████████▌           | 86/179 [00:31<00:34,  2.68it/s][A
- 49%|██████████▋           | 87/179 [00:32<00:33,  2.78it/s][A
- 49%|██████████▊           | 88/179 [00:32<00:31,  2.87it/s][A
- 50%|██████████▉           | 89/179 [00:32<00:36,  2.46it/s][A
- 50%|███████████           | 90/179 [00:33<00:33,  2.69it/s][A
- 51%|███████████▏          | 91/179 [00:33<00:31,  2.79it/s][A
- 51%|███████████▎          | 92/179 [00:33<00:30,  2.87it/s][A
- 52%|███████████▍          | 93/179 [00:34<00:34,  2.47it/s][A
- 53%|███████████▌          | 94/179 [00:34<00:31,  2.70it/s][A
- 53%|███████████▋          | 95/179 [00:35<00:29,  2.81it/s][A
- 54%|███████████▊          | 96/179 [00:35<00:28,  2.89it/s][A
- 54%|███████████▉          | 97/179 [00:35<00:33,  2.48it/s][A
- 55%|████████████          | 98/179 [00:36<00:30,  2.69it/s][A
- 55%|████████████▏         | 99/179 [00:36<00:28,  2.80it/s][A
- 56%|███████████▋         | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|███████████▊         | 101/179 [00:37<00:31,  2.47it/s][A
- 57%|███████████▉         | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|████████████         | 103/179 [00:37<00:27,  2.80it/s][A
- 58%|████████████▏        | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|████████████▎        | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|████████████▍        | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|████████████▌        | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|████████████▋        | 108/179 [00:39<00:24,  2.86it/s][A
- 61%|████████████▊        | 109/179 [00:40<00:28,  2.46it/s][A
- 61%|████████████▉        | 110/179 [00:40<00:25,  2.69it/s][A
- 62%|█████████████        | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|█████████████▏       | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|█████████████▎       | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|█████████████▎       | 114/179 [00:42<00:24,  2.68it/s][A
- 64%|█████████████▍       | 115/179 [00:42<00:23,  2.78it/s][A
- 65%|█████████████▌       | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|█████████████▋       | 117/179 [00:43<00:25,  2.44it/s][A
- 66%|█████████████▊       | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|█████████████▉       | 119/179 [00:43<00:21,  2.79it/s][A
- 67%|██████████████       | 120/179 [00:44<00:20,  2.88it/s][A
- 68%|██████████████▏      | 121/179 [00:44<00:23,  2.48it/s][A
- 68%|██████████████▎      | 122/179 [00:45<00:21,  2.70it/s][A
- 69%|██████████████▍      | 123/179 [00:45<00:19,  2.81it/s][A
- 69%|██████████████▌      | 124/179 [00:45<00:19,  2.88it/s][A
- 70%|██████████████▋      | 125/179 [00:46<00:21,  2.48it/s][A
- 70%|██████████████▊      | 126/179 [00:46<00:19,  2.70it/s][A
- 71%|██████████████▉      | 127/179 [00:46<00:18,  2.80it/s][A
- 72%|███████████████      | 128/179 [00:47<00:17,  2.87it/s][A
- 72%|███████████████▏     | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|███████████████▎     | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|███████████████▎     | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|███████████████▍     | 132/179 [00:48<00:16,  2.88it/s][A
- 74%|███████████████▌     | 133/179 [00:49<00:18,  2.46it/s][A
- 75%|███████████████▋     | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|███████████████▊     | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|███████████████▉     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|████████████████     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|████████████████▏    | 138/179 [00:51<00:15,  2.68it/s][A
- 78%|████████████████▎    | 139/179 [00:51<00:14,  2.78it/s][A
- 78%|████████████████▍    | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|████████████████▌    | 141/179 [00:52<00:15,  2.46it/s][A
- 79%|████████████████▋    | 142/179 [00:52<00:13,  2.68it/s][A
- 80%|████████████████▊    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|████████████████▉    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|█████████████████    | 145/179 [00:53<00:13,  2.44it/s][A
- 82%|█████████████████▏   | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|█████████████████▏   | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|█████████████████▎   | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|█████████████████▍   | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|█████████████████▌   | 150/179 [00:55<00:10,  2.70it/s][A
- 84%|█████████████████▋   | 151/179 [00:55<00:09,  2.81it/s][A
- 85%|█████████████████▊   | 152/179 [00:56<00:09,  2.88it/s][A
- 85%|█████████████████▉   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|██████████████████   | 154/179 [00:56<00:09,  2.69it/s][A
- 87%|██████████████████▏  | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|██████████████████▎  | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|██████████████████▍  | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|██████████████████▌  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|██████████████████▋  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|██████████████████▊  | 160/179 [00:59<00:06,  2.88it/s][A
- 90%|██████████████████▉  | 161/179 [00:59<00:07,  2.46it/s][A
- 91%|███████████████████  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|███████████████████  | 163/179 [01:00<00:05,  2.81it/s][A
- 92%|███████████████████▏ | 164/179 [01:00<00:05,  2.89it/s][A
- 92%|███████████████████▎ | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|███████████████████▍ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|███████████████████▌ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|███████████████████▋ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|███████████████████▊ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|███████████████████▉ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|████████████████████ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|████████████████████▏| 172/179 [01:03<00:02,  2.87it/s][A
- 97%|████████████████████▎| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|████████████████████▍| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|████████████████████▌| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|████████████████████▋| 176/179 [01:05<00:01,  2.85it/s][A
- 99%|████████████████████▊| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|████████████████████▉| 178/179 [01:05<00:00,  2.69it/s][A
-100%|█████████████████████| 179/179 [01:06<00:00,  2.60it/s][A                                                            
-                                                            [A{'eval_loss': 2.443500518798828, 'eval_runtime': 68.7305, 'eval_samples_per_second': 2.852, 'eval_steps_per_second': 1.426, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.35}
- 35%|█████▌          | 350/1000 [2:06:09<1:23:23,  7.70s/it]
-100%|█████████████████████| 179/179 [01:06<00:00,  2.60it/s][A
-                                                            [A[2025-10-18 21:08:56,491] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-350
- 35%|█████▌          | 351/1000 [2:06:20<5:38:12, 31.27s/it]                                                            {'loss': 2.3772, 'grad_norm': 0.813891589641571, 'learning_rate': 0.00015093201623287631, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 836.62, 'epoch': 0.35}
- 35%|█████▌          | 351/1000 [2:06:20<5:38:12, 31.27s/it] 35%|█████▋          | 352/1000 [2:06:27<4:21:07, 24.18s/it]                                                            {'loss': 2.9365, 'grad_norm': 0.7941728830337524, 'learning_rate': 0.00015065302972317108, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1097.47, 'epoch': 0.35}
- 35%|█████▋          | 352/1000 [2:06:27<4:21:07, 24.18s/it] 35%|█████▋          | 353/1000 [2:06:35<3:27:17, 19.22s/it]                                                            {'loss': 2.5405, 'grad_norm': 0.8034257888793945, 'learning_rate': 0.00015037351188715265, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 912.8, 'epoch': 0.35}
- 35%|█████▋          | 353/1000 [2:06:35<3:27:17, 19.22s/it] 35%|█████▋          | 354/1000 [2:06:43<2:49:38, 15.76s/it]                                                            {'loss': 2.3865, 'grad_norm': 0.7860594391822815, 'learning_rate': 0.00015009346565683087, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1003.7, 'epoch': 0.35}
- 35%|█████▋          | 354/1000 [2:06:43<2:49:38, 15.76s/it] 36%|█████▋          | 355/1000 [2:06:50<2:23:15, 13.33s/it]                                                            {'loss': 2.5599, 'grad_norm': 0.8608607649803162, 'learning_rate': 0.00014981289396975817, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 707.15, 'epoch': 0.35}
- 36%|█████▋          | 355/1000 [2:06:50<2:23:15, 13.33s/it] 36%|█████▋          | 356/1000 [2:06:58<2:04:55, 11.64s/it]                                                            {'loss': 2.5071, 'grad_norm': 0.7168118953704834, 'learning_rate': 0.00014953179976899878, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1166.41, 'epoch': 0.36}
- 36%|█████▋          | 356/1000 [2:06:58<2:04:55, 11.64s/it] 36%|█████▋          | 357/1000 [2:07:06<1:52:03, 10.46s/it]                                                            {'loss': 2.3815, 'grad_norm': 0.7269739508628845, 'learning_rate': 0.00014925018600309785, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1071.23, 'epoch': 0.36}
- 36%|█████▋          | 357/1000 [2:07:06<1:52:03, 10.46s/it] 36%|█████▋          | 358/1000 [2:07:13<1:43:01,  9.63s/it]                                                            {'loss': 2.7037, 'grad_norm': 0.8309071660041809, 'learning_rate': 0.0001489680556260505, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 914.08, 'epoch': 0.36}
- 36%|█████▋          | 358/1000 [2:07:13<1:43:01,  9.63s/it] 36%|█████▋          | 359/1000 [2:07:21<1:36:41,  9.05s/it]                                                            {'loss': 2.2837, 'grad_norm': 0.7140702605247498, 'learning_rate': 0.00014868541159727096, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1198.46, 'epoch': 0.36}
- 36%|█████▋          | 359/1000 [2:07:21<1:36:41,  9.05s/it] 36%|█████▊          | 360/1000 [2:07:29<1:32:12,  8.64s/it]                                                            {'loss': 2.7429, 'grad_norm': 0.9101309180259705, 'learning_rate': 0.0001484022568815613, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 842.65, 'epoch': 0.36}
- 36%|█████▊          | 360/1000 [2:07:29<1:32:12,  8.64s/it] 36%|█████▊          | 361/1000 [2:07:36<1:28:56,  8.35s/it]                                                            {'loss': 2.8159, 'grad_norm': 1.0654335021972656, 'learning_rate': 0.00014811859444908052, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 597.07, 'epoch': 0.36}
- 36%|█████▊          | 361/1000 [2:07:36<1:28:56,  8.35s/it] 36%|█████▊          | 362/1000 [2:07:44<1:26:49,  8.17s/it]                                                            {'loss': 2.4037, 'grad_norm': 0.8588113784790039, 'learning_rate': 0.00014783442727531328, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 748.24, 'epoch': 0.36}
- 36%|█████▊          | 362/1000 [2:07:44<1:26:49,  8.17s/it] 36%|█████▊          | 363/1000 [2:07:52<1:25:12,  8.03s/it]                                                            {'loss': 2.3868, 'grad_norm': 0.7777385115623474, 'learning_rate': 0.00014754975834103877, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 905.51, 'epoch': 0.36}
- 36%|█████▊          | 363/1000 [2:07:52<1:25:12,  8.03s/it] 36%|█████▊          | 364/1000 [2:08:00<1:24:02,  7.93s/it]                                                            {'loss': 2.5601, 'grad_norm': 1.0356276035308838, 'learning_rate': 0.00014726459063229945, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 604.3, 'epoch': 0.36}
- 36%|█████▊          | 364/1000 [2:08:00<1:24:02,  7.93s/it] 36%|█████▊          | 365/1000 [2:08:07<1:23:19,  7.87s/it]                                                            {'loss': 2.4824, 'grad_norm': 0.8619339466094971, 'learning_rate': 0.00014697892714036958, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 879.51, 'epoch': 0.36}
- 36%|█████▊          | 365/1000 [2:08:07<1:23:19,  7.87s/it] 37%|█████▊          | 366/1000 [2:08:15<1:22:34,  7.82s/it]                                                            {'loss': 2.9975, 'grad_norm': 0.8925931453704834, 'learning_rate': 0.00014669277086172406, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 877.48, 'epoch': 0.37}
- 37%|█████▊          | 366/1000 [2:08:15<1:22:34,  7.82s/it] 37%|█████▊          | 367/1000 [2:08:23<1:22:06,  7.78s/it]                                                            {'loss': 2.5735, 'grad_norm': 0.8300255537033081, 'learning_rate': 0.00014640612479800686, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 890.78, 'epoch': 0.37}
- 37%|█████▊          | 367/1000 [2:08:23<1:22:06,  7.78s/it] 37%|█████▉          | 368/1000 [2:08:30<1:21:37,  7.75s/it]                                                            {'loss': 2.6903, 'grad_norm': 0.9414756894111633, 'learning_rate': 0.00014611899195599953, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 763.39, 'epoch': 0.37}
- 37%|█████▉          | 368/1000 [2:08:30<1:21:37,  7.75s/it] 37%|█████▉          | 369/1000 [2:08:38<1:21:21,  7.74s/it]                                                            {'loss': 2.4977, 'grad_norm': 0.8205591440200806, 'learning_rate': 0.00014583137534758967, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 838.82, 'epoch': 0.37}
- 37%|█████▉          | 369/1000 [2:08:38<1:21:21,  7.74s/it] 37%|█████▉          | 370/1000 [2:08:46<1:21:06,  7.73s/it]                                                            {'loss': 2.5312, 'grad_norm': 0.7638918161392212, 'learning_rate': 0.0001455432779897395, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 898.87, 'epoch': 0.37}
- 37%|█████▉          | 370/1000 [2:08:46<1:21:06,  7.73s/it] 37%|█████▉          | 371/1000 [2:08:54<1:20:54,  7.72s/it]                                                            {'loss': 2.6966, 'grad_norm': 0.8035619258880615, 'learning_rate': 0.00014525470290445392, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 831.58, 'epoch': 0.37}
- 37%|█████▉          | 371/1000 [2:08:54<1:20:54,  7.72s/it] 37%|█████▉          | 372/1000 [2:09:01<1:20:42,  7.71s/it]                                                            {'loss': 2.4598, 'grad_norm': 0.8352977633476257, 'learning_rate': 0.00014496565311874902, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 776.34, 'epoch': 0.37}
- 37%|█████▉          | 372/1000 [2:09:01<1:20:42,  7.71s/it] 37%|█████▉          | 373/1000 [2:09:09<1:20:31,  7.71s/it]                                                            {'loss': 2.6271, 'grad_norm': 0.9382032155990601, 'learning_rate': 0.00014467613166462023, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 682.13, 'epoch': 0.37}
- 37%|█████▉          | 373/1000 [2:09:09<1:20:31,  7.71s/it] 37%|█████▉          | 374/1000 [2:09:17<1:20:21,  7.70s/it]                                                            {'loss': 2.5899, 'grad_norm': 0.8817945122718811, 'learning_rate': 0.0001443861415790107, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 757.96, 'epoch': 0.37}
- 37%|█████▉          | 374/1000 [2:09:17<1:20:21,  7.70s/it] 38%|██████          | 375/1000 [2:09:24<1:20:14,  7.70s/it]                                                            {'loss': 2.3897, 'grad_norm': 0.8268300294876099, 'learning_rate': 0.00014409568590377918, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 803.32, 'epoch': 0.38}
- 38%|██████          | 375/1000 [2:09:24<1:20:14,  7.70s/it] 38%|██████          | 376/1000 [2:09:32<1:20:10,  7.71s/it]                                                            {'loss': 2.6176, 'grad_norm': 0.837008535861969, 'learning_rate': 0.00014380476768566824, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 924.94, 'epoch': 0.38}
- 38%|██████          | 376/1000 [2:09:32<1:20:10,  7.71s/it] 38%|██████          | 377/1000 [2:09:40<1:20:01,  7.71s/it]                                                            {'loss': 2.4618, 'grad_norm': 0.7949783802032471, 'learning_rate': 0.00014351338997627234, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 962.41, 'epoch': 0.38}
- 38%|██████          | 377/1000 [2:09:40<1:20:01,  7.71s/it] 38%|██████          | 378/1000 [2:09:47<1:19:46,  7.70s/it]                                                            {'loss': 2.5777, 'grad_norm': 0.9026557803153992, 'learning_rate': 0.00014322155583200576, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 822.37, 'epoch': 0.38}
- 38%|██████          | 378/1000 [2:09:47<1:19:46,  7.70s/it] 38%|██████          | 379/1000 [2:09:55<1:19:28,  7.68s/it]                                                            {'loss': 2.4548, 'grad_norm': 0.9520031213760376, 'learning_rate': 0.00014292926831407061, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 618.97, 'epoch': 0.38}
- 38%|██████          | 379/1000 [2:09:55<1:19:28,  7.68s/it] 38%|██████          | 380/1000 [2:10:03<1:19:13,  7.67s/it]                                                            {'loss': 2.5319, 'grad_norm': 0.7308140993118286, 'learning_rate': 0.0001426365304884246, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1056.97, 'epoch': 0.38}
- 38%|██████          | 380/1000 [2:10:03<1:19:13,  7.67s/it] 38%|██████          | 381/1000 [2:10:10<1:18:54,  7.65s/it]                                                            {'loss': 2.4645, 'grad_norm': 0.8277641534805298, 'learning_rate': 0.00014234334542574906, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 825.78, 'epoch': 0.38}
- 38%|██████          | 381/1000 [2:10:10<1:18:54,  7.65s/it] 38%|██████          | 382/1000 [2:10:18<1:18:34,  7.63s/it]                                                            {'loss': 2.4223, 'grad_norm': 0.8942565321922302, 'learning_rate': 0.00014204971620141647, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 650.58, 'epoch': 0.38}
- 38%|██████          | 382/1000 [2:10:18<1:18:34,  7.63s/it] 38%|██████▏         | 383/1000 [2:10:25<1:18:28,  7.63s/it]                                                            {'loss': 2.5694, 'grad_norm': 0.8665900826454163, 'learning_rate': 0.00014175564589545854, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 751.31, 'epoch': 0.38}
- 38%|██████▏         | 383/1000 [2:10:26<1:18:28,  7.63s/it] 38%|██████▏         | 384/1000 [2:10:33<1:18:24,  7.64s/it]                                                            {'loss': 2.7354, 'grad_norm': 0.8844501376152039, 'learning_rate': 0.00014146113759253362, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 837.13, 'epoch': 0.38}
- 38%|██████▏         | 384/1000 [2:10:33<1:18:24,  7.64s/it] 38%|██████▏         | 385/1000 [2:10:41<1:18:20,  7.64s/it]                                                            {'loss': 2.6194, 'grad_norm': 0.95892733335495, 'learning_rate': 0.0001411661943818944, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 690.03, 'epoch': 0.39}
- 38%|██████▏         | 385/1000 [2:10:41<1:18:20,  7.64s/it] 39%|██████▏         | 386/1000 [2:10:48<1:18:08,  7.64s/it]                                                            {'loss': 2.4362, 'grad_norm': 0.9186114072799683, 'learning_rate': 0.00014087081935735564, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 652.12, 'epoch': 0.39}
- 39%|██████▏         | 386/1000 [2:10:48<1:18:08,  7.64s/it] 39%|██████▏         | 387/1000 [2:10:56<1:18:00,  7.64s/it]                                                            {'loss': 2.5661, 'grad_norm': 0.8644664883613586, 'learning_rate': 0.00014057501561726157, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 802.24, 'epoch': 0.39}
- 39%|██████▏         | 387/1000 [2:10:56<1:18:00,  7.64s/it] 39%|██████▏         | 388/1000 [2:11:04<1:17:55,  7.64s/it]                                                            {'loss': 2.4357, 'grad_norm': 0.7970757484436035, 'learning_rate': 0.0001402787862644534, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 967.19, 'epoch': 0.39}
- 39%|██████▏         | 388/1000 [2:11:04<1:17:55,  7.64s/it] 39%|██████▏         | 389/1000 [2:11:11<1:17:43,  7.63s/it]                                                            {'loss': 2.5794, 'grad_norm': 0.7623278498649597, 'learning_rate': 0.0001399821344062369, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 924.26, 'epoch': 0.39}
- 39%|██████▏         | 389/1000 [2:11:11<1:17:43,  7.63s/it] 39%|██████▏         | 390/1000 [2:11:19<1:17:35,  7.63s/it]                                                            {'loss': 2.5374, 'grad_norm': 0.9226461052894592, 'learning_rate': 0.00013968506315434974, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 597.05, 'epoch': 0.39}
- 39%|██████▏         | 390/1000 [2:11:19<1:17:35,  7.63s/it] 39%|██████▎         | 391/1000 [2:11:27<1:17:28,  7.63s/it]                                                            {'loss': 2.517, 'grad_norm': 0.7621864080429077, 'learning_rate': 0.00013938757562492873, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 939.16, 'epoch': 0.39}
- 39%|██████▎         | 391/1000 [2:11:27<1:17:28,  7.63s/it] 39%|██████▎         | 392/1000 [2:11:34<1:17:23,  7.64s/it]                                                            {'loss': 2.6206, 'grad_norm': 0.7409645318984985, 'learning_rate': 0.0001390896749384773, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1063.91, 'epoch': 0.39}
- 39%|██████▎         | 392/1000 [2:11:34<1:17:23,  7.64s/it] 39%|██████▎         | 393/1000 [2:11:42<1:17:14,  7.63s/it]                                                            {'loss': 2.5777, 'grad_norm': 0.7624543905258179, 'learning_rate': 0.00013879136421983266, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1021.85, 'epoch': 0.39}
- 39%|██████▎         | 393/1000 [2:11:42<1:17:14,  7.63s/it] 39%|██████▎         | 394/1000 [2:11:49<1:17:06,  7.63s/it]                                                            {'loss': 2.5385, 'grad_norm': 0.8827208876609802, 'learning_rate': 0.00013849264659813312, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 692.88, 'epoch': 0.39}
- 39%|██████▎         | 394/1000 [2:11:50<1:17:06,  7.63s/it] 40%|██████▎         | 395/1000 [2:11:57<1:17:04,  7.64s/it]                                                            {'loss': 2.4439, 'grad_norm': 0.7421484589576721, 'learning_rate': 0.0001381935252067852, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 972.84, 'epoch': 0.4}
- 40%|██████▎         | 395/1000 [2:11:57<1:17:04,  7.64s/it] 40%|██████▎         | 396/1000 [2:12:05<1:17:02,  7.65s/it]                                                            {'loss': 2.383, 'grad_norm': 0.6603926420211792, 'learning_rate': 0.00013789400318343068, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1249.15, 'epoch': 0.4}
- 40%|██████▎         | 396/1000 [2:12:05<1:17:02,  7.65s/it] 40%|██████▎         | 397/1000 [2:12:12<1:16:52,  7.65s/it]                                                            {'loss': 2.5581, 'grad_norm': 0.8028379678726196, 'learning_rate': 0.0001375940836699139, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 930.9, 'epoch': 0.4}
- 40%|██████▎         | 397/1000 [2:12:12<1:16:52,  7.65s/it] 40%|██████▎         | 398/1000 [2:12:20<1:16:42,  7.65s/it]                                                            {'loss': 2.6189, 'grad_norm': 0.8258647918701172, 'learning_rate': 0.0001372937698122487, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 873.5, 'epoch': 0.4}
- 40%|██████▎         | 398/1000 [2:12:20<1:16:42,  7.65s/it] 40%|██████▍         | 399/1000 [2:12:28<1:16:36,  7.65s/it]                                                            {'loss': 2.4689, 'grad_norm': 0.9657416343688965, 'learning_rate': 0.0001369930647605852, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 838.74, 'epoch': 0.4}
- 40%|██████▍         | 399/1000 [2:12:28<1:16:36,  7.65s/it] 40%|██████▍         | 400/1000 [2:12:35<1:16:25,  7.64s/it]                                                            {'loss': 2.2787, 'grad_norm': 0.8712354898452759, 'learning_rate': 0.00013669197166917723, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 775.62, 'epoch': 0.4}
- 40%|██████▍         | 400/1000 [2:12:35<1:16:25,  7.64s/it][2025-10-18 21:15:22,523] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 21:15:25,086] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.242159366607666
-[2025-10-18 21:15:26,303] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.2169339656829834
-[2025-10-18 21:15:27,524] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.2205369472503662
-[2025-10-18 21:15:28,758] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.2337253093719482
-[2025-10-18 21:15:28,758] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                               | 0/179 [00:00<?, ?it/s][A
-  1%|▎                      | 2/179 [00:00<00:28,  6.19it/s][A
-  2%|▍                      | 3/179 [00:00<00:40,  4.40it/s][A
-  2%|▌                      | 4/179 [00:00<00:45,  3.82it/s][A
-  3%|▋                      | 5/179 [00:01<01:17,  2.24it/s][A
-  3%|▊                      | 6/179 [00:02<01:08,  2.53it/s][A
-  4%|▉                      | 7/179 [00:02<01:03,  2.70it/s][A
-  4%|█                      | 8/179 [00:02<01:00,  2.82it/s][A
-  5%|█▏                     | 9/179 [00:03<01:09,  2.45it/s][A
-  6%|█▏                    | 10/179 [00:03<01:02,  2.69it/s][A
-  6%|█▎                    | 11/179 [00:03<00:59,  2.80it/s][A
-  7%|█▍                    | 12/179 [00:04<00:57,  2.90it/s][A
-  7%|█▌                    | 13/179 [00:04<01:06,  2.49it/s][A
-  8%|█▋                    | 14/179 [00:04<01:00,  2.72it/s][A
-  8%|█▊                    | 15/179 [00:05<00:58,  2.83it/s][A
-  9%|█▉                    | 16/179 [00:05<00:56,  2.90it/s][A
-  9%|██                    | 17/179 [00:06<01:08,  2.37it/s][A
- 10%|██▏                   | 18/179 [00:06<01:00,  2.66it/s][A
- 11%|██▎                   | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▍                   | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▌                   | 21/179 [00:07<01:03,  2.49it/s][A
- 12%|██▋                   | 22/179 [00:07<00:57,  2.72it/s][A
- 13%|██▊                   | 23/179 [00:08<00:55,  2.83it/s][A
- 13%|██▉                   | 24/179 [00:08<00:53,  2.91it/s][A
- 14%|███                   | 25/179 [00:09<01:01,  2.51it/s][A
- 15%|███▏                  | 26/179 [00:09<00:56,  2.73it/s][A
- 15%|███▎                  | 27/179 [00:09<00:53,  2.83it/s][A
- 16%|███▍                  | 28/179 [00:10<00:51,  2.91it/s][A
- 16%|███▌                  | 29/179 [00:10<00:59,  2.51it/s][A
- 17%|███▋                  | 30/179 [00:10<00:54,  2.74it/s][A
- 17%|███▊                  | 31/179 [00:11<00:52,  2.84it/s][A
- 18%|███▉                  | 32/179 [00:11<00:50,  2.92it/s][A
- 18%|████                  | 33/179 [00:12<00:58,  2.51it/s][A
- 19%|████▏                 | 34/179 [00:12<00:53,  2.73it/s][A
- 20%|████▎                 | 35/179 [00:12<00:50,  2.84it/s][A
- 20%|████▍                 | 36/179 [00:12<00:49,  2.92it/s][A
- 21%|████▌                 | 37/179 [00:13<00:56,  2.51it/s][A
- 21%|████▋                 | 38/179 [00:13<00:51,  2.74it/s][A
- 22%|████▊                 | 39/179 [00:14<00:49,  2.84it/s][A
- 22%|████▉                 | 40/179 [00:14<00:47,  2.92it/s][A
- 23%|█████                 | 41/179 [00:14<00:54,  2.52it/s][A
- 23%|█████▏                | 42/179 [00:15<00:49,  2.74it/s][A
- 24%|█████▎                | 43/179 [00:15<00:47,  2.85it/s][A
- 25%|█████▍                | 44/179 [00:15<00:46,  2.92it/s][A
- 25%|█████▌                | 45/179 [00:16<00:53,  2.51it/s][A
- 26%|█████▋                | 46/179 [00:16<00:48,  2.74it/s][A
- 26%|█████▊                | 47/179 [00:17<00:46,  2.84it/s][A
- 27%|█████▉                | 48/179 [00:17<00:44,  2.92it/s][A
- 27%|██████                | 49/179 [00:17<00:51,  2.50it/s][A
- 28%|██████▏               | 50/179 [00:18<00:47,  2.73it/s][A
- 28%|██████▎               | 51/179 [00:18<00:45,  2.84it/s][A
- 29%|██████▍               | 52/179 [00:18<00:43,  2.91it/s][A
- 30%|██████▌               | 53/179 [00:19<00:50,  2.50it/s][A
- 30%|██████▋               | 54/179 [00:19<00:45,  2.73it/s][A
- 31%|██████▊               | 55/179 [00:19<00:43,  2.84it/s][A
- 31%|██████▉               | 56/179 [00:20<00:42,  2.92it/s][A
- 32%|███████               | 57/179 [00:20<00:48,  2.51it/s][A
- 32%|███████▏              | 58/179 [00:21<00:44,  2.74it/s][A
- 33%|███████▎              | 59/179 [00:21<00:42,  2.84it/s][A
- 34%|███████▎              | 60/179 [00:21<00:40,  2.92it/s][A
- 34%|███████▍              | 61/179 [00:22<00:47,  2.50it/s][A
- 35%|███████▌              | 62/179 [00:22<00:42,  2.72it/s][A
- 35%|███████▋              | 63/179 [00:22<00:40,  2.83it/s][A
- 36%|███████▊              | 64/179 [00:23<00:39,  2.91it/s][A
- 36%|███████▉              | 65/179 [00:23<00:45,  2.50it/s][A
- 37%|████████              | 66/179 [00:23<00:41,  2.73it/s][A
- 37%|████████▏             | 67/179 [00:24<00:39,  2.82it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.70it/s][A
- 40%|█████████▉               | 71/179 [00:25<00:38,  2.81it/s][A
- 40%|██████████               | 72/179 [00:26<00:36,  2.90it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.49it/s][A
- 41%|██████████▎              | 74/179 [00:26<00:38,  2.72it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:36,  2.83it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.91it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:40,  2.50it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.71it/s][A
- 44%|███████████              | 79/179 [00:28<00:35,  2.82it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.90it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|███████████▍             | 82/179 [00:29<00:35,  2.71it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.85it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.46it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|████████████▏            | 87/179 [00:31<00:32,  2.80it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.88it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.49it/s][A
- 50%|████████████▌            | 90/179 [00:32<00:32,  2.71it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.81it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.90it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.49it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.72it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:29,  2.83it/s][A
- 54%|█████████████▍           | 96/179 [00:34<00:28,  2.89it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:32,  2.49it/s][A
- 55%|█████████████▋           | 98/179 [00:35<00:29,  2.70it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.81it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.86it/s][A
- 56%|█████████████▌          | 101/179 [00:36<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.81it/s][A
- 58%|█████████████▉          | 104/179 [00:37<00:26,  2.88it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|██████████████▏         | 106/179 [00:38<00:27,  2.69it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.88it/s][A
- 61%|██████████████▌         | 109/179 [00:39<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.68it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:40<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.48it/s][A
- 64%|███████████████▎        | 114/179 [00:41<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.79it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.87it/s][A
- 65%|███████████████▋        | 117/179 [00:42<00:25,  2.47it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.69it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.80it/s][A
- 67%|████████████████        | 120/179 [00:43<00:20,  2.89it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:44<00:21,  2.70it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:19,  2.80it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.89it/s][A
- 70%|████████████████▊       | 125/179 [00:45<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.79it/s][A
- 72%|█████████████████▏      | 128/179 [00:46<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:47<00:18,  2.69it/s][A
- 73%|█████████████████▌      | 131/179 [00:47<00:17,  2.80it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.88it/s][A
- 74%|█████████���███████▊      | 133/179 [00:48<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|██████████████████▏     | 136/179 [00:49<00:14,  2.87it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.47it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:15,  2.70it/s][A
- 78%|██████████████████▋     | 139/179 [00:50<00:14,  2.80it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|██████████████████▉     | 141/179 [00:51<00:15,  2.48it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:52<00:12,  2.87it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.45it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.66it/s][A
- 82%|███████████████████▋    | 147/179 [00:53<00:11,  2.77it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.84it/s][A
- 83%|███████████████████▉    | 149/179 [00:54<00:12,  2.45it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|████████████████████▍   | 152/179 [00:55<00:09,  2.86it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.46it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.69it/s][A
- 87%|████████████████████▊   | 155/179 [00:56<00:08,  2.80it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.87it/s][A
- 88%|█████████████████████   | 157/179 [00:57<00:08,  2.47it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.82it/s][A
- 89%|█████████████████████▍  | 160/179 [00:58<00:06,  2.90it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.49it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.71it/s][A
- 91%|█████████████████████▊  | 163/179 [00:59<00:05,  2.81it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.89it/s][A
- 92%|██████████████████████  | 165/179 [01:00<00:05,  2.47it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.68it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.78it/s][A
- 94%|██████████████████████▌ | 168/179 [01:01<00:03,  2.86it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:02<00:02,  2.79it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.86it/s][A
- 97%|███████████████████████▏| 173/179 [01:03<00:02,  2.45it/s][A
- 97%|███████████████████████▎| 174/179 [01:03<00:01,  2.68it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|███████████████████████▌| 176/179 [01:04<00:01,  2.88it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.70it/s][A
-100%|████████████████████████| 179/179 [01:05<00:00,  2.54it/s][A                                                            
-                                                               [A{'eval_loss': 2.418457508087158, 'eval_runtime': 68.2103, 'eval_samples_per_second': 2.873, 'eval_steps_per_second': 1.437, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.4}
- 40%|███████▌           | 400/1000 [2:13:50<1:16:25,  7.64s/it]
-100%|████████████████████████| 179/179 [01:05<00:00,  2.54it/s][A
-                                                               [A[2025-10-18 21:16:36,976] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-400
- 40%|███████▌           | 401/1000 [2:14:01<5:08:29, 30.90s/it]                                                               {'loss': 2.5914, 'grad_norm': 0.8923930525779724, 'learning_rate': 0.00013639049369634876, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 747.87, 'epoch': 0.4}
- 40%|███████▌           | 401/1000 [2:14:01<5:08:29, 30.90s/it] 40%|███████▋           | 402/1000 [2:14:08<3:58:25, 23.92s/it]                                                               {'loss': 2.502, 'grad_norm': 0.9753550887107849, 'learning_rate': 0.00013608863400446113, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 652.69, 'epoch': 0.4}
- 40%|███████▋           | 402/1000 [2:14:08<3:58:25, 23.92s/it] 40%|███████▋           | 403/1000 [2:14:16<3:09:34, 19.05s/it]                                                               {'loss': 2.5921, 'grad_norm': 0.7291651964187622, 'learning_rate': 0.00013578639575987958, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1106.1, 'epoch': 0.4}
- 40%|███████▋           | 403/1000 [2:14:16<3:09:34, 19.05s/it] 40%|███████▋           | 404/1000 [2:14:24<2:35:25, 15.65s/it]                                                               {'loss': 2.6109, 'grad_norm': 0.7746625542640686, 'learning_rate': 0.0001354837821329404, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 982.42, 'epoch': 0.4}
- 40%|███████▋           | 404/1000 [2:14:24<2:35:25, 15.65s/it] 40%|███████▋           | 405/1000 [2:14:31<2:11:29, 13.26s/it]                                                               {'loss': 2.8019, 'grad_norm': 0.8975655436515808, 'learning_rate': 0.00013518079629791724, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 863.74, 'epoch': 0.41}
- 40%|███████▋           | 405/1000 [2:14:31<2:11:29, 13.26s/it] 41%|███████▋           | 406/1000 [2:14:39<1:54:45, 11.59s/it]                                                               {'loss': 2.5341, 'grad_norm': 0.7543463110923767, 'learning_rate': 0.00013487744143298822, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1035.37, 'epoch': 0.41}
- 41%|███████▋           | 406/1000 [2:14:39<1:54:45, 11.59s/it] 41%|███████▋           | 407/1000 [2:14:47<1:43:02, 10.43s/it]                                                               {'loss': 2.2751, 'grad_norm': 0.7028976082801819, 'learning_rate': 0.0001345737207202023, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 966.06, 'epoch': 0.41}
- 41%|███████▋           | 407/1000 [2:14:47<1:43:02, 10.43s/it] 41%|███████▊           | 408/1000 [2:14:54<1:34:44,  9.60s/it]                                                               {'loss': 2.4514, 'grad_norm': 0.8045576214790344, 'learning_rate': 0.000134269637345446, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 904.15, 'epoch': 0.41}
- 41%|███████▊           | 408/1000 [2:14:54<1:34:44,  9.60s/it] 41%|███████▊           | 409/1000 [2:15:02<1:28:52,  9.02s/it]                                                               {'loss': 2.5345, 'grad_norm': 0.9659880995750427, 'learning_rate': 0.00013396519449841005, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 566.87, 'epoch': 0.41}
- 41%|███████▊           | 409/1000 [2:15:02<1:28:52,  9.02s/it] 41%|███████▊           | 410/1000 [2:15:10<1:24:47,  8.62s/it]                                                               {'loss': 2.5164, 'grad_norm': 0.8003166317939758, 'learning_rate': 0.0001336603953725559, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 927.14, 'epoch': 0.41}
- 41%|███████▊           | 410/1000 [2:15:10<1:24:47,  8.62s/it] 41%|███████▊           | 411/1000 [2:15:17<1:21:53,  8.34s/it]                                                               {'loss': 2.6755, 'grad_norm': 0.868290901184082, 'learning_rate': 0.00013335524316508208, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 797.73, 'epoch': 0.41}
- 41%|███████▊           | 411/1000 [2:15:17<1:21:53,  8.34s/it] 41%|███████▊           | 412/1000 [2:15:25<1:19:49,  8.15s/it]                                                               {'loss': 2.5262, 'grad_norm': 0.7279526591300964, 'learning_rate': 0.00013304974107689087, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1160.05, 'epoch': 0.41}
- 41%|███████▊           | 412/1000 [2:15:25<1:19:49,  8.15s/it] 41%|███████▊           | 413/1000 [2:15:33<1:18:21,  8.01s/it]                                                               {'loss': 2.4219, 'grad_norm': 0.8151289224624634, 'learning_rate': 0.00013274389231255466, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 812.71, 'epoch': 0.41}
- 41%|███████▊           | 413/1000 [2:15:33<1:18:21,  8.01s/it] 41%|███████▊           | 414/1000 [2:15:41<1:17:19,  7.92s/it]                                                               {'loss': 2.682, 'grad_norm': 0.8819336295127869, 'learning_rate': 0.00013243770008028224, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 824.02, 'epoch': 0.41}
- 41%|███████▊           | 414/1000 [2:15:41<1:17:19,  7.92s/it] 42%|███████▉           | 415/1000 [2:15:48<1:16:32,  7.85s/it]                                                               {'loss': 2.7377, 'grad_norm': 0.7311098575592041, 'learning_rate': 0.00013213116759188523, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1205.37, 'epoch': 0.41}
- 42%|███████▉           | 415/1000 [2:15:48<1:16:32,  7.85s/it] 42%|███████▉           | 416/1000 [2:15:56<1:15:59,  7.81s/it]                                                               {'loss': 2.4735, 'grad_norm': 0.8012425899505615, 'learning_rate': 0.0001318242980627444, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 873.14, 'epoch': 0.42}
- 42%|███████▉           | 416/1000 [2:15:56<1:15:59,  7.81s/it] 42%|███████▉           | 417/1000 [2:16:04<1:15:32,  7.78s/it]                                                               {'loss': 2.429, 'grad_norm': 0.7181097865104675, 'learning_rate': 0.00013151709471177588, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1094.49, 'epoch': 0.42}
- 42%|███████▉           | 417/1000 [2:16:04<1:15:32,  7.78s/it] 42%|███████▉           | 418/1000 [2:16:11<1:15:11,  7.75s/it]                                                               {'loss': 2.7837, 'grad_norm': 0.853551983833313, 'learning_rate': 0.00013120956076139746, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 778.06, 'epoch': 0.42}
- 42%|███████▉           | 418/1000 [2:16:11<1:15:11,  7.75s/it] 42%|███████▉           | 419/1000 [2:16:19<1:14:55,  7.74s/it]                                                               {'loss': 2.3333, 'grad_norm': 0.89939284324646, 'learning_rate': 0.00013090169943749476, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 646.28, 'epoch': 0.42}
- 42%|███████▉           | 419/1000 [2:16:19<1:14:55,  7.74s/it] 42%|███████▉           | 420/1000 [2:16:27<1:14:38,  7.72s/it]                                                               {'loss': 2.2275, 'grad_norm': 0.8608824014663696, 'learning_rate': 0.0001305935139693874, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 737.79, 'epoch': 0.42}
- 42%|███████▉           | 420/1000 [2:16:27<1:14:38,  7.72s/it] 42%|███████▉           | 421/1000 [2:16:34<1:14:31,  7.72s/it]                                                               {'loss': 2.4751, 'grad_norm': 0.6963393092155457, 'learning_rate': 0.00013028500758979506, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1196.93, 'epoch': 0.42}
- 42%|███████▉           | 421/1000 [2:16:34<1:14:31,  7.72s/it] 42%|████████           | 422/1000 [2:16:42<1:14:16,  7.71s/it]                                                               {'loss': 2.6107, 'grad_norm': 0.8561040759086609, 'learning_rate': 0.00012997618353480377, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 853.79, 'epoch': 0.42}
- 42%|████████           | 422/1000 [2:16:42<1:14:16,  7.71s/it] 42%|████████           | 423/1000 [2:16:50<1:14:09,  7.71s/it]                                                               {'loss': 2.363, 'grad_norm': 0.831372082233429, 'learning_rate': 0.00012966704504383168, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 818.4, 'epoch': 0.42}
- 42%|████████           | 423/1000 [2:16:50<1:14:09,  7.71s/it] 42%|████████           | 424/1000 [2:16:58<1:13:59,  7.71s/it]                                                               {'loss': 2.6113, 'grad_norm': 0.9238219261169434, 'learning_rate': 0.00012935759535959528, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 732.47, 'epoch': 0.42}
- 42%|████████           | 424/1000 [2:16:58<1:13:59,  7.71s/it] 42%|████████           | 425/1000 [2:17:05<1:13:48,  7.70s/it]                                                               {'loss': 2.416, 'grad_norm': 0.8584331274032593, 'learning_rate': 0.00012904783772807533, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 772.44, 'epoch': 0.42}
- 42%|████████           | 425/1000 [2:17:05<1:13:48,  7.70s/it] 43%|████████           | 426/1000 [2:17:13<1:13:41,  7.70s/it]                                                               {'loss': 2.5092, 'grad_norm': 0.7186541557312012, 'learning_rate': 0.00012873777539848283, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1076.3, 'epoch': 0.43}
- 43%|████████           | 426/1000 [2:17:13<1:13:41,  7.70s/it] 43%|████████           | 427/1000 [2:17:21<1:13:35,  7.71s/it]                                                               {'loss': 2.5507, 'grad_norm': 0.7716029286384583, 'learning_rate': 0.00012842741162322487, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1030.17, 'epoch': 0.43}
- 43%|████████           | 427/1000 [2:17:21<1:13:35,  7.71s/it] 43%|████████▏          | 428/1000 [2:17:28<1:13:26,  7.70s/it]                                                               {'loss': 2.3571, 'grad_norm': 0.7560110688209534, 'learning_rate': 0.00012811674965787056, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1044.38, 'epoch': 0.43}
- 43%|████████▏          | 428/1000 [2:17:28<1:13:26,  7.70s/it] 43%|████████▏          | 429/1000 [2:17:36<1:13:24,  7.71s/it]                                                               {'loss': 2.4123, 'grad_norm': 0.8426553010940552, 'learning_rate': 0.00012780579276111702, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 776.97, 'epoch': 0.43}
- 43%|████████▏          | 429/1000 [2:17:36<1:13:24,  7.71s/it] 43%|████████▏          | 430/1000 [2:17:44<1:13:10,  7.70s/it]                                                               {'loss': 2.3846, 'grad_norm': 0.806000292301178, 'learning_rate': 0.00012749454419475487, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 912.14, 'epoch': 0.43}
- 43%|████████▏          | 430/1000 [2:17:44<1:13:10,  7.70s/it] 43%|████████▏          | 431/1000 [2:17:51<1:12:59,  7.70s/it]                                                               {'loss': 2.6421, 'grad_norm': 0.8410206437110901, 'learning_rate': 0.0001271830072236343, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 803.06, 'epoch': 0.43}
- 43%|████████▏          | 431/1000 [2:17:51<1:12:59,  7.70s/it] 43%|████████▏          | 432/1000 [2:17:59<1:12:51,  7.70s/it]                                                               {'loss': 2.726, 'grad_norm': 0.9306586980819702, 'learning_rate': 0.00012687118511563075, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 748.12, 'epoch': 0.43}
- 43%|████████▏          | 432/1000 [2:17:59<1:12:51,  7.70s/it] 43%|████████▏          | 433/1000 [2:18:07<1:12:43,  7.70s/it]                                                               {'loss': 2.5208, 'grad_norm': 0.7977080941200256, 'learning_rate': 0.0001265590811416105, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 847.75, 'epoch': 0.43}
- 43%|████████▏          | 433/1000 [2:18:07<1:12:43,  7.70s/it] 43%|████████▏          | 434/1000 [2:18:14<1:12:36,  7.70s/it]                                                               {'loss': 2.4782, 'grad_norm': 0.9194881916046143, 'learning_rate': 0.0001262466985753967, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 628.41, 'epoch': 0.43}
- 43%|████████▏          | 434/1000 [2:18:15<1:12:36,  7.70s/it] 44%|████████▎          | 435/1000 [2:18:22<1:12:30,  7.70s/it]                                                               {'loss': 2.6776, 'grad_norm': 0.8599761128425598, 'learning_rate': 0.0001259340406937345, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 957.84, 'epoch': 0.43}
- 44%|████████▎          | 435/1000 [2:18:22<1:12:30,  7.70s/it] 44%|████████▎          | 436/1000 [2:18:30<1:12:20,  7.70s/it]                                                               {'loss': 2.6578, 'grad_norm': 0.8129534721374512, 'learning_rate': 0.00012562111077625722, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 813.37, 'epoch': 0.44}
- 44%|████████▎          | 436/1000 [2:18:30<1:12:20,  7.70s/it] 44%|████████▎          | 437/1000 [2:18:38<1:12:18,  7.71s/it]                                                               {'loss': 2.4761, 'grad_norm': 0.7826356291770935, 'learning_rate': 0.00012530791210545162, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 955.94, 'epoch': 0.44}
- 44%|████████▎          | 437/1000 [2:18:38<1:12:18,  7.71s/it] 44%|████████▎          | 438/1000 [2:18:45<1:12:08,  7.70s/it]                                                               {'loss': 2.5767, 'grad_norm': 0.7191296219825745, 'learning_rate': 0.00012499444796662353, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1092.59, 'epoch': 0.44}
- 44%|████████▎          | 438/1000 [2:18:45<1:12:08,  7.70s/it] 44%|████████▎          | 439/1000 [2:18:53<1:11:58,  7.70s/it]                                                               {'loss': 2.4674, 'grad_norm': 0.7478817701339722, 'learning_rate': 0.0001246807216478634, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1000.68, 'epoch': 0.44}
- 44%|████████▎          | 439/1000 [2:18:53<1:11:58,  7.70s/it] 44%|████████▎          | 440/1000 [2:19:01<1:11:46,  7.69s/it]                                                               {'loss': 2.7326, 'grad_norm': 0.8722161650657654, 'learning_rate': 0.00012436673644001197, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 841.34, 'epoch': 0.44}
- 44%|████████▎          | 440/1000 [2:19:01<1:11:46,  7.69s/it] 44%|████████▍          | 441/1000 [2:19:08<1:11:39,  7.69s/it]                                                               {'loss': 2.4877, 'grad_norm': 0.8003708124160767, 'learning_rate': 0.00012405249563662537, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1020.14, 'epoch': 0.44}
- 44%|████████▍          | 441/1000 [2:19:08<1:11:39,  7.69s/it] 44%|████████▍          | 442/1000 [2:19:16<1:11:31,  7.69s/it]                                                               {'loss': 2.6038, 'grad_norm': 0.8045440912246704, 'learning_rate': 0.00012373800253394102, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 966.05, 'epoch': 0.44}
- 44%|████████▍          | 442/1000 [2:19:16<1:11:31,  7.69s/it] 44%|████████▍          | 443/1000 [2:19:24<1:11:20,  7.68s/it]                                                               {'loss': 2.4096, 'grad_norm': 0.8588775396347046, 'learning_rate': 0.00012342326043084266, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 750.92, 'epoch': 0.44}
- 44%|████████▍          | 443/1000 [2:19:24<1:11:20,  7.68s/it] 44%|████████▍          | 444/1000 [2:19:31<1:11:11,  7.68s/it]                                                               {'loss': 2.5142, 'grad_norm': 0.8550599217414856, 'learning_rate': 0.00012310827262882615, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 821.74, 'epoch': 0.44}
- 44%|████████▍          | 444/1000 [2:19:31<1:11:11,  7.68s/it] 44%|████████▍          | 445/1000 [2:19:39<1:11:05,  7.69s/it]                                                               {'loss': 2.3919, 'grad_norm': 0.8404627442359924, 'learning_rate': 0.00012279304243196436, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 864.4, 'epoch': 0.45}
- 44%|████████▍          | 445/1000 [2:19:39<1:11:05,  7.69s/it] 45%|████████▍          | 446/1000 [2:19:47<1:10:57,  7.69s/it]                                                               {'loss': 2.4175, 'grad_norm': 0.8438916802406311, 'learning_rate': 0.00012247757314687297, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 770.48, 'epoch': 0.45}
- 45%|████████▍          | 446/1000 [2:19:47<1:10:57,  7.69s/it] 45%|████████▍          | 447/1000 [2:19:54<1:10:51,  7.69s/it]                                                               {'loss': 2.6993, 'grad_norm': 0.8843498826026917, 'learning_rate': 0.00012216186808267546, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 825.18, 'epoch': 0.45}
- 45%|████████▍          | 447/1000 [2:19:54<1:10:51,  7.69s/it] 45%|████████▌          | 448/1000 [2:20:02<1:10:49,  7.70s/it]                                                               {'loss': 2.3752, 'grad_norm': 0.7928943037986755, 'learning_rate': 0.00012184593055096854, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 799.05, 'epoch': 0.45}
- 45%|████████▌          | 448/1000 [2:20:02<1:10:49,  7.70s/it] 45%|████████▌          | 449/1000 [2:20:10<1:10:39,  7.69s/it]                                                               {'loss': 2.5344, 'grad_norm': 0.8000917434692383, 'learning_rate': 0.0001215297638657875, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 862.62, 'epoch': 0.45}
- 45%|████████▌          | 449/1000 [2:20:10<1:10:39,  7.69s/it] 45%|████████▌          | 450/1000 [2:20:18<1:10:33,  7.70s/it]                                                               {'loss': 2.541, 'grad_norm': 0.7369298338890076, 'learning_rate': 0.0001212133713435712, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1026.33, 'epoch': 0.45}
- 45%|████████▌          | 450/1000 [2:20:18<1:10:33,  7.70s/it][2025-10-18 21:23:04,711] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 21:23:07,777] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.460540533065796
-[2025-10-18 21:23:09,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4278185367584229
-[2025-10-18 21:23:10,644] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.438683271408081
-[2025-10-18 21:23:12,104] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4601521492004395
-[2025-10-18 21:23:12,105] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.25it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.35it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.73it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.79it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.42it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.66it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.85it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.46it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.69it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.47it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.80it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▉                      | 21/179 [00:07<01:03,  2.48it/s][A
- 12%|███                      | 22/179 [00:07<00:58,  2.70it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.70it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.80it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.88it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.48it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|████▎                    | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.88it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.81it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.69it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:50,  2.80it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.89it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.81it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.88it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:46,  2.81it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.89it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.48it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.70it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.80it/s][A
- 29%|███████▎                 | 52/179 [00:18<00:44,  2.88it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.69it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.80it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.89it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.81it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.88it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.70it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.80it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.88it/s][A
- 36%|█████████                | 65/179 [00:23<00:46,  2.47it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:42,  2.69it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.68it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.79it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.79it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.87it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.70it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.46it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:36,  2.69it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.47it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.79it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.86it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:33,  2.68it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.78it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.86it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.47it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.68it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:30,  2.79it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.86it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.45it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.68it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.78it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.86it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.68it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.79it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.87it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.69it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.80it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.47it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.69it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.79it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.85it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.80it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.70it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.81it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.87it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.69it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.80it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:14,  2.87it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.47it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:15,  2.70it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.80it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.79it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.87it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.47it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.69it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.46it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.78it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.88it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.70it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.47it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.88it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.46it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.87it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.78it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.78it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.86it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.48it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.70it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|███████████████████████▌| 176/179 [01:04<00:01,  2.87it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.70it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A                                                               
-                                                               [A{'eval_loss': 2.399780035018921, 'eval_runtime': 68.5691, 'eval_samples_per_second': 2.858, 'eval_steps_per_second': 1.429, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.45}
- 45%|████████▌          | 450/1000 [2:21:34<1:10:33,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A
-                                                               [A[2025-10-18 21:24:20,681] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-450
- 45%|████████▌          | 451/1000 [2:21:44<4:47:39, 31.44s/it]                                                               {'loss': 2.5206, 'grad_norm': 0.7964299917221069, 'learning_rate': 0.00012089675630312754, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 927.47, 'epoch': 0.45}
- 45%|████████▌          | 451/1000 [2:21:44<4:47:39, 31.44s/it] 45%|████████▌          | 452/1000 [2:21:52<3:41:57, 24.30s/it]                                                               {'loss': 2.3714, 'grad_norm': 0.7111493945121765, 'learning_rate': 0.00012057992206559837, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1132.27, 'epoch': 0.45}
- 45%|████████▌          | 452/1000 [2:21:52<3:41:57, 24.30s/it] 45%|████████▌          | 453/1000 [2:22:00<2:56:01, 19.31s/it]                                                               {'loss': 2.4477, 'grad_norm': 0.8085815906524658, 'learning_rate': 0.00012026287195442503, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 875.45, 'epoch': 0.45}
- 45%|████████▌          | 453/1000 [2:22:00<2:56:01, 19.31s/it] 45%|████████▋          | 454/1000 [2:22:07<2:23:52, 15.81s/it]                                                               {'loss': 2.5141, 'grad_norm': 0.8297038674354553, 'learning_rate': 0.00011994560929531309, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 851.8, 'epoch': 0.45}
- 45%|████████▋          | 454/1000 [2:22:07<2:23:52, 15.81s/it] 46%|████████▋          | 455/1000 [2:22:15<2:01:25, 13.37s/it]                                                               {'loss': 2.59, 'grad_norm': 0.776443600654602, 'learning_rate': 0.00011962813741619777, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 989.49, 'epoch': 0.46}
- 46%|████████▋          | 455/1000 [2:22:15<2:01:25, 13.37s/it] 46%|████████▋          | 456/1000 [2:22:23<1:45:40, 11.65s/it]                                                               {'loss': 2.688, 'grad_norm': 0.8000941276550293, 'learning_rate': 0.00011931045964720881, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1038.49, 'epoch': 0.46}
- 46%|████████▋          | 456/1000 [2:22:23<1:45:40, 11.65s/it] 46%|████████▋          | 457/1000 [2:22:30<1:34:43, 10.47s/it]                                                               {'loss': 2.5517, 'grad_norm': 0.7830643057823181, 'learning_rate': 0.0001189925793206357, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1037.92, 'epoch': 0.46}
- 46%|████████▋          | 457/1000 [2:22:30<1:34:43, 10.47s/it] 46%|████████▋          | 458/1000 [2:22:38<1:27:08,  9.65s/it]                                                               {'loss': 2.5708, 'grad_norm': 0.6935010552406311, 'learning_rate': 0.00011867449977089265, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1180.77, 'epoch': 0.46}
- 46%|████████▋          | 458/1000 [2:22:38<1:27:08,  9.65s/it] 46%|████████▋          | 459/1000 [2:22:46<1:21:42,  9.06s/it]                                                               {'loss': 2.4687, 'grad_norm': 0.7038302421569824, 'learning_rate': 0.00011835622433448361, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1058.52, 'epoch': 0.46}
- 46%|████████▋          | 459/1000 [2:22:46<1:21:42,  9.06s/it] 46%|████████▋          | 460/1000 [2:22:53<1:17:45,  8.64s/it]                                                               {'loss': 2.5338, 'grad_norm': 0.7659316062927246, 'learning_rate': 0.00011803775634996734, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 966.98, 'epoch': 0.46}
- 46%|████████▋          | 460/1000 [2:22:53<1:17:45,  8.64s/it] 46%|████████▊          | 461/1000 [2:23:01<1:15:05,  8.36s/it]                                                               {'loss': 2.5058, 'grad_norm': 0.7030954957008362, 'learning_rate': 0.0001177190991579223, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1146.73, 'epoch': 0.46}
- 46%|████████▊          | 461/1000 [2:23:01<1:15:05,  8.36s/it] 46%|████████▊          | 462/1000 [2:23:09<1:13:11,  8.16s/it]                                                               {'loss': 2.5484, 'grad_norm': 0.7747669816017151, 'learning_rate': 0.00011740025610091159, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 907.47, 'epoch': 0.46}
- 46%|████████▊          | 462/1000 [2:23:09<1:13:11,  8.16s/it] 46%|████████▊          | 463/1000 [2:23:17<1:11:48,  8.02s/it]                                                               {'loss': 2.5484, 'grad_norm': 0.8438674807548523, 'learning_rate': 0.00011708123052344804, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 831.44, 'epoch': 0.46}
- 46%|████████▊          | 463/1000 [2:23:17<1:11:48,  8.02s/it][2025-10-18 21:26:25,505] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42528] Loading dataset: AiAF/conversations with base_type: chat_template and prompt_style: None
-[2025-10-18 21:26:25,505] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:42528] Using chat template:
----
-{{ bos_token }}
-{% for m in messages %}
-  {% set role = 'model' if m['role']=='assistant' else 'user' %}
-  {{ '<start_of_turn>' + role + '\n' + m['content'] | trim + '<end_of_turn>\n' }}
-{% endfor %}
-{% if add_generation_prompt %}
-{{ '<start_of_turn>model\n' }}
-{% endif %}
-
----
-
-Tokenizing Prompts (num_proc=12):   0%| | 0/10000 [00:00<?, ? e[A
-Tokenizing Prompts (num_proc=12):   8%| | 834/10000 [24:35<4:30[A
-Tokenizing Prompts (num_proc=12):  17%|▏| 1667/10000 [25:37<1:4[A
-Tokenizing Prompts (num_proc=12):  25%|▎| 2501/10000 [25:55<53:[A
-Tokenizing Prompts (num_proc=12):  33%|▎| 3334/10000 [27:26<33:[A
-Tokenizing Prompts (num_proc=12):  42%|▍| 4168/10000 [27:41<19:[A
-Tokenizing Prompts (num_proc=12):  50%|▌| 5001/10000 [28:53<13:[A
-Tokenizing Prompts (num_proc=12):  58%|▌| 5834/10000 [28:59<07:[A
-Tokenizing Prompts (num_proc=12):  67%|▋| 6667/10000 [29:25<04:[A
-Tokenizing Prompts (num_proc=12):  75%|▊| 7501/10000 [30:53<03:[A
-Tokenizing Prompts (num_proc=12):  83%|▊| 8334/10000 [33:10<03:[A
-Tokenizing Prompts (num_proc=12):  92%|▉| 9167/10000 [37:39<02:[A
-Tokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [45:28<00[ATokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [45:31<00
-
-Dropping Long Sequences:   0%| | 0/10000 [00:00<?, ? examples/s[A
-Dropping Long Sequences:  10%| | 1000/10000 [00:10<01:36, 93.34[A
-Dropping Long Sequences:  20%|▏| 2000/10000 [00:19<01:15, 106.2[A
-Dropping Long Sequences:  30%|▎| 3000/10000 [00:27<01:02, 112.8[A
-Dropping Long Sequences:  40%|▍| 4000/10000 [00:35<00:51, 116.8[A
-Dropping Long Sequences:  50%|▌| 5000/10000 [00:44<00:43, 114.6[A
-Dropping Long Sequences:  60%|▌| 6000/10000 [00:53<00:34, 114.5[A
-Dropping Long Sequences:  70%|▋| 7000/10000 [01:01<00:25, 117.5[A
-Dropping Long Sequences:  80%|▊| 8000/10000 [01:10<00:17, 115.4[A
-Dropping Long Sequences:  90%|▉| 9000/10000 [01:17<00:08, 120.3[A
-Dropping Long Sequences: 100%|█| 10000/10000 [01:26<00:00, 118.[ADropping Long Sequences: 100%|█| 10000/10000 [01:26<00:00, 115.
-
-Add position_id column (Pretraining Sample Packing):   0%| | 0/[A
-Add position_id column (Pretraining Sample Packing):  53%|▌| 10[A
-Add position_id column (Pretraining Sample Packing): 100%|█| 18[AAdd position_id column (Pretraining Sample Packing): 100%|█| 18
-[2025-10-18 22:13:30,588] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42528] Using single process for pack_parallel, running sequentially.
- 46%|███████▍        | 464/1000 [3:11:00<128:44:06, 864.64s/it]                                                               {'loss': 2.4798, 'grad_norm': 0.8187122344970703, 'learning_rate': 0.00011676202577195901, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 793.16, 'epoch': 0.46}
- 46%|███████▍        | 464/1000 [3:11:00<128:44:06, 864.64s/it] 46%|███████▉         | 465/1000 [3:11:07<90:16:36, 607.47s/it]                                                               {'loss': 2.499, 'grad_norm': 0.7975175380706787, 'learning_rate': 0.0001164426451947513, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 901.43, 'epoch': 0.47}
- 46%|███████▉         | 465/1000 [3:11:07<90:16:36, 607.47s/it] 47%|███████▉         | 466/1000 [3:11:15<63:24:29, 427.47s/it]                                                               {'loss': 2.4283, 'grad_norm': 0.7642038464546204, 'learning_rate': 0.00011612309214197599, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 950.69, 'epoch': 0.47}
- 47%|███████▉         | 466/1000 [3:11:15<63:24:29, 427.47s/it] 47%|███████▉         | 467/1000 [3:11:22<44:38:09, 301.48s/it]                                                               {'loss': 2.4028, 'grad_norm': 0.7735118865966797, 'learning_rate': 0.00011580336996559343, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 971.4, 'epoch': 0.47}
- 47%|███████▉         | 467/1000 [3:11:22<44:38:09, 301.48s/it] 47%|███████▉         | 468/1000 [3:11:30<31:31:11, 213.29s/it]                                                               {'loss': 2.7025, 'grad_norm': 0.7525343298912048, 'learning_rate': 0.00011548348201933798, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1208.72, 'epoch': 0.47}
- 47%|███████▉         | 468/1000 [3:11:30<31:31:11, 213.29s/it] 47%|███████▉         | 469/1000 [3:11:37<22:21:13, 151.55s/it]                                                               {'loss': 2.3699, 'grad_norm': 0.8704997897148132, 'learning_rate': 0.00011516343165868279, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 721.07, 'epoch': 0.47}
- 47%|███████▉         | 469/1000 [3:11:37<22:21:13, 151.55s/it] 47%|███████▉         | 470/1000 [3:11:45<15:57:02, 108.34s/it]                                                               {'loss': 2.3736, 'grad_norm': 0.7402556538581848, 'learning_rate': 0.00011484322224080472, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1096.23, 'epoch': 0.47}
- 47%|███████▉         | 470/1000 [3:11:45<15:57:02, 108.34s/it] 47%|████████▍         | 471/1000 [3:11:52<11:28:39, 78.11s/it]                                                               {'loss': 2.2508, 'grad_norm': 0.7937670350074768, 'learning_rate': 0.00011452285712454904, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 924.08, 'epoch': 0.47}
- 47%|████████▍         | 471/1000 [3:11:52<11:28:39, 78.11s/it] 47%|████████▉          | 472/1000 [3:12:00<8:21:12, 56.96s/it]                                                               {'loss': 2.5329, 'grad_norm': 0.7053034901618958, 'learning_rate': 0.00011420233967039422, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1204.7, 'epoch': 0.47}
- 47%|████████▉          | 472/1000 [3:12:00<8:21:12, 56.96s/it] 47%|████████▉          | 473/1000 [3:12:08<6:10:13, 42.15s/it]                                                               {'loss': 2.5691, 'grad_norm': 0.7511913180351257, 'learning_rate': 0.00011388167324041669, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1124.02, 'epoch': 0.47}
- 47%|████████▉          | 473/1000 [3:12:08<6:10:13, 42.15s/it] 47%|█████████          | 474/1000 [3:12:15<4:38:40, 31.79s/it]                                                               {'loss': 2.512, 'grad_norm': 0.7045575976371765, 'learning_rate': 0.00011356086119825553, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1119.42, 'epoch': 0.47}
- 47%|█████████          | 474/1000 [3:12:15<4:38:40, 31.79s/it] 48%|█████████          | 475/1000 [3:12:23<3:34:40, 24.53s/it]                                                               {'loss': 2.5501, 'grad_norm': 0.8239957094192505, 'learning_rate': 0.00011323990690907733, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 869.37, 'epoch': 0.47}
- 48%|█████████          | 475/1000 [3:12:23<3:34:40, 24.53s/it] 48%|█████████          | 476/1000 [3:12:31<2:50:02, 19.47s/it]                                                               {'loss': 2.5661, 'grad_norm': 0.6598995923995972, 'learning_rate': 0.00011291881373954065, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1280.82, 'epoch': 0.48}
- 48%|█████████          | 476/1000 [3:12:31<2:50:02, 19.47s/it] 48%|█████████          | 477/1000 [3:12:38<2:18:47, 15.92s/it]                                                               {'loss': 2.4256, 'grad_norm': 0.8774147033691406, 'learning_rate': 0.00011259758505776092, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 778.15, 'epoch': 0.48}
- 48%|█████████          | 477/1000 [3:12:38<2:18:47, 15.92s/it] 48%|█████████          | 478/1000 [3:12:46<1:56:59, 13.45s/it]                                                               {'loss': 2.5325, 'grad_norm': 0.8101653456687927, 'learning_rate': 0.00011227622423327502, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 962.01, 'epoch': 0.48}
- 48%|█████████          | 478/1000 [3:12:46<1:56:59, 13.45s/it] 48%|█████████          | 479/1000 [3:12:54<1:41:38, 11.71s/it]                                                               {'loss': 2.5386, 'grad_norm': 0.804762601852417, 'learning_rate': 0.0001119547346370059, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 872.53, 'epoch': 0.48}
- 48%|█████████          | 479/1000 [3:12:54<1:41:38, 11.71s/it] 48%|█████████          | 480/1000 [3:13:01<1:30:57, 10.49s/it]                                                               {'loss': 2.532, 'grad_norm': 0.6869064569473267, 'learning_rate': 0.00011163311964122734, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1101.55, 'epoch': 0.48}
- 48%|█████████          | 480/1000 [3:13:01<1:30:57, 10.49s/it] 48%|█████████▏         | 481/1000 [3:13:09<1:23:25,  9.64s/it]                                                               {'loss': 2.3844, 'grad_norm': 0.8395105004310608, 'learning_rate': 0.00011131138261952845, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 719.41, 'epoch': 0.48}
- 48%|█████████▏         | 481/1000 [3:13:09<1:23:25,  9.64s/it] 48%|█████████▏         | 482/1000 [3:13:17<1:18:07,  9.05s/it]                                                               {'loss': 2.4296, 'grad_norm': 0.7923110127449036, 'learning_rate': 0.00011098952694677829, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 805.85, 'epoch': 0.48}
- 48%|█████████▏         | 482/1000 [3:13:17<1:18:07,  9.05s/it] 48%|█████████▏         | 483/1000 [3:13:24<1:14:28,  8.64s/it]                                                               {'loss': 2.5879, 'grad_norm': 0.6729269623756409, 'learning_rate': 0.00011066755599909064, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1235.81, 'epoch': 0.48}
- 48%|█████████▏         | 483/1000 [3:13:24<1:14:28,  8.64s/it] 48%|█████████▏         | 484/1000 [3:13:32<1:11:49,  8.35s/it]                                                               {'loss': 2.5023, 'grad_norm': 0.9864143133163452, 'learning_rate': 0.00011034547315378838, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 575.8, 'epoch': 0.48}
- 48%|█████████▏         | 484/1000 [3:13:32<1:11:49,  8.35s/it] 48%|█████████▏         | 485/1000 [3:13:40<1:10:00,  8.16s/it]                                                               {'loss': 2.4915, 'grad_norm': 0.7772215008735657, 'learning_rate': 0.00011002328178936811, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 830.94, 'epoch': 0.48}
- 48%|█████████▏         | 485/1000 [3:13:40<1:10:00,  8.16s/it] 49%|█████████▏         | 486/1000 [3:13:47<1:08:38,  8.01s/it]                                                               {'loss': 2.4713, 'grad_norm': 0.7779741883277893, 'learning_rate': 0.00010970098528546481, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 896.84, 'epoch': 0.49}
- 49%|█████████▏         | 486/1000 [3:13:47<1:08:38,  8.01s/it] 49%|█████████▎         | 487/1000 [3:13:55<1:07:41,  7.92s/it]                                                               {'loss': 2.5551, 'grad_norm': 0.7895573973655701, 'learning_rate': 0.00010937858702281631, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 893.74, 'epoch': 0.49}
- 49%|█████████▎         | 487/1000 [3:13:55<1:07:41,  7.92s/it] 49%|█████████▎         | 488/1000 [3:14:03<1:06:58,  7.85s/it]                                                               {'loss': 2.6383, 'grad_norm': 0.7889907956123352, 'learning_rate': 0.00010905609038322779, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 949.13, 'epoch': 0.49}
- 49%|█████████▎         | 488/1000 [3:14:03<1:06:58,  7.85s/it] 49%|█████████▎         | 489/1000 [3:14:10<1:06:27,  7.80s/it]                                                               {'loss': 2.4145, 'grad_norm': 0.7472605109214783, 'learning_rate': 0.0001087334987495364, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1045.07, 'epoch': 0.49}
- 49%|█████████▎         | 489/1000 [3:14:10<1:06:27,  7.80s/it] 49%|█████████▎         | 490/1000 [3:14:18<1:06:01,  7.77s/it]                                                               {'loss': 2.4559, 'grad_norm': 0.7652000784873962, 'learning_rate': 0.00010841081550557578, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 903.84, 'epoch': 0.49}
- 49%|█████████▎         | 490/1000 [3:14:18<1:06:01,  7.77s/it] 49%|█████████▎         | 491/1000 [3:14:26<1:05:41,  7.74s/it]                                                               {'loss': 2.5901, 'grad_norm': 0.8343738913536072, 'learning_rate': 0.00010808804403614043, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 851.91, 'epoch': 0.49}
- 49%|█████████▎         | 491/1000 [3:14:26<1:05:41,  7.74s/it] 49%|█████████▎         | 492/1000 [3:14:33<1:05:26,  7.73s/it]                                                               {'loss': 2.5643, 'grad_norm': 0.9088501930236816, 'learning_rate': 0.00010776518772695034, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 734.37, 'epoch': 0.49}
- 49%|█████████▎         | 492/1000 [3:14:33<1:05:26,  7.73s/it] 49%|█████████▎         | 493/1000 [3:14:41<1:05:17,  7.73s/it]                                                               {'loss': 2.4883, 'grad_norm': 0.7713831067085266, 'learning_rate': 0.0001074422499646154, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 927.05, 'epoch': 0.49}
- 49%|█████████▎         | 493/1000 [3:14:41<1:05:17,  7.73s/it] 49%|█████████▍         | 494/1000 [3:14:49<1:05:07,  7.72s/it]                                                               {'loss': 2.5538, 'grad_norm': 0.9699030518531799, 'learning_rate': 0.00010711923413659995, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 572.38, 'epoch': 0.49}
- 49%|█████████▍         | 494/1000 [3:14:49<1:05:07,  7.72s/it] 50%|█████████▍         | 495/1000 [3:14:57<1:05:02,  7.73s/it]                                                               {'loss': 2.1787, 'grad_norm': 0.7855165600776672, 'learning_rate': 0.00010679614363118717, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 894.84, 'epoch': 0.49}
- 50%|█████████▍         | 495/1000 [3:14:57<1:05:02,  7.73s/it] 50%|█████████▍         | 496/1000 [3:15:04<1:04:50,  7.72s/it]                                                               {'loss': 2.7243, 'grad_norm': 0.725069522857666, 'learning_rate': 0.00010647298183744359, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1121.58, 'epoch': 0.5}
- 50%|█████████▍         | 496/1000 [3:15:04<1:04:50,  7.72s/it] 50%|█████████▍         | 497/1000 [3:15:12<1:04:42,  7.72s/it]                                                               {'loss': 2.4495, 'grad_norm': 0.7118563055992126, 'learning_rate': 0.0001061497521451835, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1167.94, 'epoch': 0.5}
- 50%|█████████▍         | 497/1000 [3:15:12<1:04:42,  7.72s/it] 50%|█████████▍         | 498/1000 [3:15:20<1:04:34,  7.72s/it]                                                               {'loss': 2.511, 'grad_norm': 0.7984691858291626, 'learning_rate': 0.00010582645794493337, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 774.51, 'epoch': 0.5}
- 50%|█████████▍         | 498/1000 [3:15:20<1:04:34,  7.72s/it] 50%|█████████▍         | 499/1000 [3:15:27<1:04:23,  7.71s/it]                                                               {'loss': 2.5662, 'grad_norm': 0.7960017323493958, 'learning_rate': 0.00010550310262789649, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 864.23, 'epoch': 0.5}
- 50%|█████████▍         | 499/1000 [3:15:27<1:04:23,  7.71s/it] 50%|█████████▌         | 500/1000 [3:15:35<1:04:15,  7.71s/it]                                                               {'loss': 2.5542, 'grad_norm': 0.7412434816360474, 'learning_rate': 0.00010517968958591705, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1069.16, 'epoch': 0.5}
- 50%|█████████▌         | 500/1000 [3:15:35<1:04:15,  7.71s/it][2025-10-18 22:18:22,231] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 22:18:25,213] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4277255535125732
-[2025-10-18 22:18:26,589] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3759262561798096
-[2025-10-18 22:18:28,003] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4137742519378662
-[2025-10-18 22:18:29,426] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4228949546813965
-[2025-10-18 22:18:29,426] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.16it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.31it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.73it/s][A
-  3%|▋                         | 5/179 [00:01<01:19,  2.20it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.49it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.65it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.42it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.77it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.47it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██▎                      | 17/179 [00:06<01:06,  2.45it/s][A
- 10%|██▌                      | 18/179 [00:06<01:00,  2.66it/s][A
- 11%|██▋                      | 19/179 [00:06<00:58,  2.73it/s][A
- 11%|██▊                      | 20/179 [00:07<00:56,  2.81it/s][A
- 12%|██▉                      | 21/179 [00:07<01:04,  2.45it/s][A
- 12%|███                      | 22/179 [00:08<00:58,  2.67it/s][A
- 13%|███▏                     | 23/179 [00:08<00:56,  2.78it/s][A
- 13%|███▎                     | 24/179 [00:08<00:54,  2.86it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.70it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.80it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.88it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.48it/s][A
- 17%|████▏                    | 30/179 [00:10<00:54,  2.71it/s][A
- 17%|████▎                    | 31/179 [00:11<00:52,  2.82it/s][A
- 18%|████▍                    | 32/179 [00:11<00:50,  2.89it/s][A
- 18%|████▌                    | 33/179 [00:12<00:58,  2.48it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.48it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.70it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.80it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.88it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:56,  2.45it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:51,  2.67it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.79it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.87it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.80it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.87it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.47it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.71it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.80it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.86it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.47it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.70it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.80it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.69it/s][A
- 33%|████████▏                | 59/179 [00:21<00:43,  2.79it/s][A
- 34%|████████▍                | 60/179 [00:22<00:41,  2.86it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|█████████                | 65/179 [00:24<00:46,  2.46it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:42,  2.69it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.70it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.79it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|██████████▏              | 73/179 [00:27<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.70it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.79it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.86it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.46it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.68it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.46it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:36,  2.66it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.76it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.85it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.47it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.68it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.79it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.88it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:33,  2.69it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.77it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.85it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.46it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.68it/s][A
- 53%|█████████████▎           | 95/179 [00:35<00:30,  2.79it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:29,  2.86it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.70it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.80it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|█████████████▊          | 103/179 [00:38<00:27,  2.80it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.45it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.68it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.88it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.71it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.79it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.70it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.79it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.68it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.80it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.88it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.46it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.78it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.69it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.79it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.87it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.47it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.68it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.69it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.70it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.87it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.47it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.88it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.87it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.46it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.67it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.77it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.85it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.67it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.78it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.86it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.46it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.68it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.79it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.87it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.23it/s][A
- 99%|███████████████████████▊| 178/179 [01:06<00:00,  2.52it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.45it/s][A                                                               
-                                                               [A{'eval_loss': 2.36397123336792, 'eval_runtime': 68.6831, 'eval_samples_per_second': 2.854, 'eval_steps_per_second': 1.427, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.5}
- 50%|█████████▌         | 500/1000 [3:16:51<1:04:15,  7.71s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.45it/s][A
-                                                               [A[2025-10-18 22:19:38,116] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-500
- 50%|█████████▌         | 501/1000 [3:17:01<4:20:23, 31.31s/it]                                                               {'loss': 2.5186, 'grad_norm': 0.6549438238143921, 'learning_rate': 0.00010485622221144484, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1316.18, 'epoch': 0.5}
- 50%|█████████▌         | 501/1000 [3:17:01<4:20:23, 31.31s/it] 50%|█████████▌         | 502/1000 [3:17:09<3:21:03, 24.22s/it]                                                               {'loss': 2.5455, 'grad_norm': 0.7539265751838684, 'learning_rate': 0.00010453270389749957, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1020.48, 'epoch': 0.5}
- 50%|█████████▌         | 502/1000 [3:17:09<3:21:03, 24.22s/it] 50%|█████████▌         | 503/1000 [3:17:17<2:39:35, 19.27s/it]                                                               {'loss': 2.4376, 'grad_norm': 0.6721911430358887, 'learning_rate': 0.00010420913803763521, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1213.57, 'epoch': 0.5}
- 50%|█████████▌         | 503/1000 [3:17:17<2:39:35, 19.27s/it] 50%|█████████▌         | 504/1000 [3:17:25<2:10:33, 15.79s/it]                                                               {'loss': 2.5178, 'grad_norm': 0.6837849617004395, 'learning_rate': 0.00010388552802590462, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1259.04, 'epoch': 0.5}
- 50%|█████████▌         | 504/1000 [3:17:25<2:10:33, 15.79s/it] 50%|█████████▌         | 505/1000 [3:17:32<1:50:11, 13.36s/it]                                                               {'loss': 2.5799, 'grad_norm': 0.8399882912635803, 'learning_rate': 0.00010356187725682359, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 827.11, 'epoch': 0.51}
- 50%|█████████▌         | 505/1000 [3:17:32<1:50:11, 13.36s/it] 51%|█████████▌         | 506/1000 [3:17:40<1:35:55, 11.65s/it]                                                               {'loss': 2.5622, 'grad_norm': 0.7216302156448364, 'learning_rate': 0.00010323818912533561, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1084.83, 'epoch': 0.51}
- 51%|█████████▌         | 506/1000 [3:17:40<1:35:55, 11.65s/it] 51%|█████████▋         | 507/1000 [3:17:48<1:26:00, 10.47s/it]                                                               {'loss': 2.4854, 'grad_norm': 0.7245450019836426, 'learning_rate': 0.00010291446702677599, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 992.78, 'epoch': 0.51}
- 51%|█████████▋         | 507/1000 [3:17:48<1:26:00, 10.47s/it] 51%|█████████▋         | 508/1000 [3:17:55<1:19:00,  9.64s/it]                                                               {'loss': 2.4567, 'grad_norm': 0.7922020554542542, 'learning_rate': 0.00010259071435683636, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 880.0, 'epoch': 0.51}
- 51%|█████████▋         | 508/1000 [3:17:55<1:19:00,  9.64s/it] 51%|█████████▋         | 509/1000 [3:18:03<1:14:06,  9.06s/it]                                                               {'loss': 2.5165, 'grad_norm': 0.6978681087493896, 'learning_rate': 0.000102266934511529, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1122.15, 'epoch': 0.51}
- 51%|█████████▋         | 509/1000 [3:18:03<1:14:06,  9.06s/it] 51%|█████████▋         | 510/1000 [3:18:11<1:10:33,  8.64s/it]                                                               {'loss': 2.4381, 'grad_norm': 0.9567455649375916, 'learning_rate': 0.00010194313088715135, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 524.05, 'epoch': 0.51}
- 51%|█████████▋         | 510/1000 [3:18:11<1:10:33,  8.64s/it] 51%|█████████▋         | 511/1000 [3:18:18<1:08:03,  8.35s/it]                                                               {'loss': 2.5738, 'grad_norm': 0.9128502607345581, 'learning_rate': 0.00010161930688025017, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 599.38, 'epoch': 0.51}
- 51%|█████████▋         | 511/1000 [3:18:18<1:08:03,  8.35s/it] 51%|█████████▋         | 512/1000 [3:18:26<1:06:14,  8.14s/it]                                                               {'loss': 2.5372, 'grad_norm': 0.9160046577453613, 'learning_rate': 0.00010129546588758605, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 766.38, 'epoch': 0.51}
- 51%|█████████▋         | 512/1000 [3:18:26<1:06:14,  8.14s/it] 51%|█████████▋         | 513/1000 [3:18:34<1:04:59,  8.01s/it]                                                               {'loss': 2.4814, 'grad_norm': 0.8545426726341248, 'learning_rate': 0.00010097161130609773, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 735.64, 'epoch': 0.51}
- 51%|█████████▋         | 513/1000 [3:18:34<1:04:59,  8.01s/it] 51%|█████████▊         | 514/1000 [3:18:41<1:04:05,  7.91s/it]                                                               {'loss': 2.7467, 'grad_norm': 0.9519962668418884, 'learning_rate': 0.00010064774653286661, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 727.96, 'epoch': 0.51}
- 51%|█████████▊         | 514/1000 [3:18:41<1:04:05,  7.91s/it] 52%|█████████▊         | 515/1000 [3:18:49<1:03:24,  7.84s/it]                                                               {'loss': 2.6442, 'grad_norm': 0.805483877658844, 'learning_rate': 0.00010032387496508089, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 913.49, 'epoch': 0.52}
- 52%|█████████▊         | 515/1000 [3:18:49<1:03:24,  7.84s/it] 52%|█████████▊         | 516/1000 [3:18:57<1:02:51,  7.79s/it]                                                               {'loss': 2.5398, 'grad_norm': 0.8770183324813843, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 814.37, 'epoch': 0.52}
- 52%|█████████▊         | 516/1000 [3:18:57<1:02:51,  7.79s/it] 52%|█████████▊         | 517/1000 [3:19:04<1:02:30,  7.77s/it]                                                               {'loss': 2.3568, 'grad_norm': 0.7383714914321899, 'learning_rate': 9.967612503491914e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 942.65, 'epoch': 0.52}
- 52%|█████████▊         | 517/1000 [3:19:04<1:02:30,  7.77s/it] 52%|█████████▊         | 518/1000 [3:19:12<1:02:10,  7.74s/it]                                                               {'loss': 2.5, 'grad_norm': 0.7110471129417419, 'learning_rate': 9.935225346713341e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1221.38, 'epoch': 0.52}
- 52%|█████████▊         | 518/1000 [3:19:12<1:02:10,  7.74s/it] 52%|█████████▊         | 519/1000 [3:19:20<1:02:02,  7.74s/it]                                                               {'loss': 2.4752, 'grad_norm': 0.7059134840965271, 'learning_rate': 9.902838869390229e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1125.24, 'epoch': 0.52}
- 52%|█████████▊         | 519/1000 [3:19:20<1:02:02,  7.74s/it] 52%|█████████▉         | 520/1000 [3:19:28<1:01:51,  7.73s/it]                                                               {'loss': 2.4835, 'grad_norm': 0.8114177584648132, 'learning_rate': 9.870453411241399e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 807.55, 'epoch': 0.52}
- 52%|█████████▉         | 520/1000 [3:19:28<1:01:51,  7.73s/it] 52%|█████████▉         | 521/1000 [3:19:35<1:01:41,  7.73s/it]                                                               {'loss': 2.7055, 'grad_norm': 0.7000157237052917, 'learning_rate': 9.838069311974986e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1201.38, 'epoch': 0.52}
- 52%|█████████▉         | 521/1000 [3:19:35<1:01:41,  7.73s/it] 52%|█████████▉         | 522/1000 [3:19:43<1:01:32,  7.72s/it]                                                               {'loss': 2.5521, 'grad_norm': 0.6917127370834351, 'learning_rate': 9.805686911284868e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1232.01, 'epoch': 0.52}
- 52%|█████████▉         | 522/1000 [3:19:43<1:01:32,  7.72s/it] 52%|█████████▉         | 523/1000 [3:19:51<1:01:20,  7.72s/it]                                                               {'loss': 2.4026, 'grad_norm': 0.6785117387771606, 'learning_rate': 9.7733065488471e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1231.7, 'epoch': 0.52}
- 52%|█████████▉         | 523/1000 [3:19:51<1:01:20,  7.72s/it] 52%|█████████▉         | 524/1000 [3:19:58<1:01:12,  7.72s/it]                                                               {'loss': 2.8254, 'grad_norm': 0.6703070998191833, 'learning_rate': 9.740928564316368e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1304.94, 'epoch': 0.52}
- 52%|█████████▉         | 524/1000 [3:19:58<1:01:12,  7.72s/it] 52%|█████████▉         | 525/1000 [3:20:06<1:01:00,  7.71s/it]                                                               {'loss': 2.5118, 'grad_norm': 0.8120262026786804, 'learning_rate': 9.708553297322406e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 883.44, 'epoch': 0.53}
- 52%|█████████▉         | 525/1000 [3:20:06<1:01:00,  7.71s/it] 53%|█████████▉         | 526/1000 [3:20:14<1:00:57,  7.72s/it]                                                               {'loss': 2.3318, 'grad_norm': 0.7423614263534546, 'learning_rate': 9.676181087466444e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 862.93, 'epoch': 0.53}
- 53%|█████████▉         | 526/1000 [3:20:14<1:00:57,  7.72s/it] 53%|██████████         | 527/1000 [3:20:22<1:00:46,  7.71s/it]                                                               {'loss': 2.6191, 'grad_norm': 0.6963997483253479, 'learning_rate': 9.643812274317644e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1112.06, 'epoch': 0.53}
- 53%|██████████         | 527/1000 [3:20:22<1:00:46,  7.71s/it] 53%|██████████         | 528/1000 [3:20:29<1:00:44,  7.72s/it]                                                               {'loss': 2.1918, 'grad_norm': 0.6949625611305237, 'learning_rate': 9.611447197409543e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1049.2, 'epoch': 0.53}
- 53%|██████████         | 528/1000 [3:20:29<1:00:44,  7.72s/it] 53%|██████████         | 529/1000 [3:20:37<1:00:46,  7.74s/it]                                                               {'loss': 2.5624, 'grad_norm': 0.8772874474525452, 'learning_rate': 9.579086196236482e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 768.59, 'epoch': 0.53}
- 53%|██████████         | 529/1000 [3:20:37<1:00:46,  7.74s/it] 53%|██████████         | 530/1000 [3:20:45<1:00:37,  7.74s/it]                                                               {'loss': 2.4266, 'grad_norm': 0.7758255004882812, 'learning_rate': 9.54672961025005e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 997.45, 'epoch': 0.53}
- 53%|██████████         | 530/1000 [3:20:45<1:00:37,  7.74s/it] 53%|██████████         | 531/1000 [3:20:53<1:00:33,  7.75s/it]                                                               {'loss': 2.5699, 'grad_norm': 0.7565093636512756, 'learning_rate': 9.514377778855521e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1017.17, 'epoch': 0.53}
- 53%|██████████         | 531/1000 [3:20:53<1:00:33,  7.75s/it] 53%|██████████         | 532/1000 [3:21:00<1:00:24,  7.75s/it]                                                               {'loss': 2.4882, 'grad_norm': 0.7254500389099121, 'learning_rate': 9.482031041408296e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 943.67, 'epoch': 0.53}
- 53%|██████████         | 532/1000 [3:21:00<1:00:24,  7.75s/it] 53%|██████████▏        | 533/1000 [3:21:08<1:00:16,  7.74s/it]                                                               {'loss': 2.402, 'grad_norm': 0.736881673336029, 'learning_rate': 9.449689737210352e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 959.01, 'epoch': 0.53}
- 53%|██████████▏        | 533/1000 [3:21:08<1:00:16,  7.74s/it] 53%|██████████▏        | 534/1000 [3:21:16<1:00:11,  7.75s/it]                                                               {'loss': 2.6478, 'grad_norm': 0.8288142085075378, 'learning_rate': 9.417354205506663e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 813.19, 'epoch': 0.53}
- 53%|██████████▏        | 534/1000 [3:21:16<1:00:11,  7.75s/it] 54%|██████████▏        | 535/1000 [3:21:24<1:00:02,  7.75s/it]                                                               {'loss': 2.3929, 'grad_norm': 0.8261322379112244, 'learning_rate': 9.385024785481654e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 787.01, 'epoch': 0.54}
- 54%|██████████▏        | 535/1000 [3:21:24<1:00:02,  7.75s/it] 54%|███████████▎         | 536/1000 [3:21:31<59:59,  7.76s/it]                                                               {'loss': 2.5318, 'grad_norm': 0.6833271980285645, 'learning_rate': 9.352701816255643e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1154.72, 'epoch': 0.54}
- 54%|███████████▎         | 536/1000 [3:21:31<59:59,  7.76s/it] 54%|███████████▎         | 537/1000 [3:21:39<59:50,  7.75s/it]                                                               {'loss': 2.6518, 'grad_norm': 0.8162757158279419, 'learning_rate': 9.320385636881283e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 858.7, 'epoch': 0.54}
- 54%|███████████▎         | 537/1000 [3:21:39<59:50,  7.75s/it] 54%|███████████▎         | 538/1000 [3:21:47<59:35,  7.74s/it]                                                               {'loss': 2.4516, 'grad_norm': 0.7262734174728394, 'learning_rate': 9.288076586340006e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1049.46, 'epoch': 0.54}
- 54%|███████████▎         | 538/1000 [3:21:47<59:35,  7.74s/it] 54%|███████████▎         | 539/1000 [3:21:54<59:20,  7.72s/it]                                                               {'loss': 2.5044, 'grad_norm': 0.8444436192512512, 'learning_rate': 9.255775003538462e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 852.72, 'epoch': 0.54}
- 54%|███████████▎         | 539/1000 [3:21:55<59:20,  7.72s/it] 54%|███████████▎         | 540/1000 [3:22:02<59:14,  7.73s/it]                                                               {'loss': 2.5365, 'grad_norm': 0.7934097647666931, 'learning_rate': 9.223481227304968e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 970.19, 'epoch': 0.54}
- 54%|███████████▎         | 540/1000 [3:22:02<59:14,  7.73s/it] 54%|███████████▎         | 541/1000 [3:22:10<59:02,  7.72s/it]                                                               {'loss': 2.5046, 'grad_norm': 0.7740036249160767, 'learning_rate': 9.19119559638596e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 881.82, 'epoch': 0.54}
- 54%|███████████▎         | 541/1000 [3:22:10<59:02,  7.72s/it] 54%|███████████▍         | 542/1000 [3:22:18<58:59,  7.73s/it]                                                               {'loss': 2.5782, 'grad_norm': 0.675055205821991, 'learning_rate': 9.158918449442423e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1189.59, 'epoch': 0.54}
- 54%|███████████▍         | 542/1000 [3:22:18<58:59,  7.73s/it] 54%|███████████▍         | 543/1000 [3:22:25<58:52,  7.73s/it]                                                               {'loss': 2.396, 'grad_norm': 1.0443894863128662, 'learning_rate': 9.126650125046361e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 510.54, 'epoch': 0.54}
- 54%|███████████▍         | 543/1000 [3:22:25<58:52,  7.73s/it] 54%|███████████▍         | 544/1000 [3:22:33<58:43,  7.73s/it]                                                               {'loss': 2.432, 'grad_norm': 0.7712026834487915, 'learning_rate': 9.094390961677223e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 896.89, 'epoch': 0.54}
- 54%|███████████▍         | 544/1000 [3:22:33<58:43,  7.73s/it] 55%|███████████▍         | 545/1000 [3:22:41<58:30,  7.71s/it]                                                               {'loss': 2.2573, 'grad_norm': 0.8501310348510742, 'learning_rate': 9.062141297718371e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 696.14, 'epoch': 0.55}
- 55%|███████████▍         | 545/1000 [3:22:41<58:30,  7.71s/it] 55%|███████████▍         | 546/1000 [3:22:49<58:21,  7.71s/it]                                                               {'loss': 2.2829, 'grad_norm': 0.7467702031135559, 'learning_rate': 9.02990147145352e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 969.04, 'epoch': 0.55}
- 55%|███████████▍         | 546/1000 [3:22:49<58:21,  7.71s/it] 55%|███████████▍         | 547/1000 [3:22:56<58:08,  7.70s/it]                                                               {'loss': 2.406, 'grad_norm': 0.9024806022644043, 'learning_rate': 8.997671821063191e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 706.58, 'epoch': 0.55}
- 55%|███████████▍         | 547/1000 [3:22:56<58:08,  7.70s/it] 55%|███████████▌         | 548/1000 [3:23:04<58:02,  7.71s/it]                                                               {'loss': 2.501, 'grad_norm': 0.7381258010864258, 'learning_rate': 8.965452684621164e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1096.72, 'epoch': 0.55}
- 55%|███████████▌         | 548/1000 [3:23:04<58:02,  7.71s/it] 55%|███████████▌         | 549/1000 [3:23:12<57:52,  7.70s/it]                                                               {'loss': 2.5192, 'grad_norm': 0.8458703756332397, 'learning_rate': 8.933244400090937e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 908.12, 'epoch': 0.55}
- 55%|███████████▌         | 549/1000 [3:23:12<57:52,  7.70s/it] 55%|███████████▌         | 550/1000 [3:23:19<57:43,  7.70s/it]                                                               {'loss': 2.6825, 'grad_norm': 0.7619299292564392, 'learning_rate': 8.901047305322172e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 969.66, 'epoch': 0.55}
- 55%|███████████▌         | 550/1000 [3:23:19<57:43,  7.70s/it][2025-10-18 22:26:06,420] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 22:26:09,420] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4275126457214355
-[2025-10-18 22:26:10,856] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4356119632720947
-[2025-10-18 22:26:12,288] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4315285682678223
-[2025-10-18 22:26:13,708] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4194743633270264
-[2025-10-18 22:26:13,708] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.17it/s][A
-  2%|▍                         | 3/179 [00:00<00:41,  4.26it/s][A
-  2%|▌                         | 4/179 [00:00<00:47,  3.70it/s][A
-  3%|▋                         | 5/179 [00:01<01:19,  2.19it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.47it/s][A
-  4%|█                         | 7/179 [00:02<01:05,  2.64it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.76it/s][A
-  5%|█▎                        | 9/179 [00:03<01:11,  2.39it/s][A
-  6%|█▍                       | 10/179 [00:03<01:04,  2.63it/s][A
-  6%|█▌                       | 11/179 [00:03<01:01,  2.75it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.83it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|██▏                      | 16/179 [00:05<00:57,  2.85it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▌                      | 18/179 [00:06<01:00,  2.68it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.86it/s][A
- 12%|██▉                      | 21/179 [00:07<01:03,  2.47it/s][A
- 12%|███                      | 22/179 [00:08<00:58,  2.70it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.48it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|████▎                    | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.86it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:59,  2.38it/s][A
- 21%|█████▎                   | 38/179 [00:14<00:53,  2.61it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:50,  2.75it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.85it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.82it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.90it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:53,  2.52it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:48,  2.74it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:46,  2.85it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:44,  2.92it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:51,  2.52it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.74it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.84it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:43,  2.91it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:50,  2.51it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:45,  2.74it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:43,  2.85it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.93it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:48,  2.50it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.72it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.81it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.88it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.48it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.69it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|█████████                | 65/179 [00:23<00:46,  2.47it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:39,  2.81it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.89it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:45,  2.43it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.66it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.78it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.86it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:43,  2.46it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.70it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.79it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:36,  2.86it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.46it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.68it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:35,  2.69it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:37,  2.47it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.68it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.80it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.88it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.48it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:32,  2.71it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.81it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.89it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.48it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.71it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:29,  2.81it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.69it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.79it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.88it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.48it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.79it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.45it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.68it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.86it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.68it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.79it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.87it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.47it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.70it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.80it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.78it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.67it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.77it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.81it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.43it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.66it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.77it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.84it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.45it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.67it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.77it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.85it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.46it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.67it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.46it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.68it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.69it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.45it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.70it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.80it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.48it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.70it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.80it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.88it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.68it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.87it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.70it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.81it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.88it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.70it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.56it/s][A                                                               
-                                                               [A{'eval_loss': 2.348385810852051, 'eval_runtime': 68.5554, 'eval_samples_per_second': 2.859, 'eval_steps_per_second': 1.429, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.55}
- 55%|███████████▌         | 550/1000 [3:24:35<57:43,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.56it/s][A
-                                                               [A[2025-10-18 22:27:22,270] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-550
- 55%|██████████▍        | 551/1000 [3:24:45<3:53:49, 31.25s/it]                                                               {'loss': 2.7841, 'grad_norm': 0.7912028431892395, 'learning_rate': 8.868861738047158e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 965.77, 'epoch': 0.55}
- 55%|██████████▍        | 551/1000 [3:24:45<3:53:49, 31.25s/it] 55%|██████████▍        | 552/1000 [3:24:53<3:00:34, 24.18s/it]                                                               {'loss': 2.5921, 'grad_norm': 0.6774265170097351, 'learning_rate': 8.836688035877267e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1323.82, 'epoch': 0.55}
- 55%|██████████▍        | 552/1000 [3:24:53<3:00:34, 24.18s/it] 55%|██████████▌        | 553/1000 [3:25:01<2:23:16, 19.23s/it]                                                               {'loss': 2.493, 'grad_norm': 0.7535359859466553, 'learning_rate': 8.804526536299413e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 926.23, 'epoch': 0.55}
- 55%|██████████▌        | 553/1000 [3:25:01<2:23:16, 19.23s/it] 55%|██████████▌        | 554/1000 [3:25:09<1:57:05, 15.75s/it]                                                               {'loss': 2.4585, 'grad_norm': 0.7500110268592834, 'learning_rate': 8.772377576672502e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 988.74, 'epoch': 0.55}
- 55%|██████████▌        | 554/1000 [3:25:09<1:57:05, 15.75s/it] 56%|██████████▌        | 555/1000 [3:25:16<1:38:53, 13.33s/it]                                                               {'loss': 2.4314, 'grad_norm': 0.756388247013092, 'learning_rate': 8.740241494223911e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 949.4, 'epoch': 0.56}
- 56%|██████████▌        | 555/1000 [3:25:16<1:38:53, 13.33s/it] 56%|██████████▌        | 556/1000 [3:25:24<1:26:06, 11.64s/it]                                                               {'loss': 2.3465, 'grad_norm': 0.8125672936439514, 'learning_rate': 8.70811862604594e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 792.73, 'epoch': 0.56}
- 56%|██████████▌        | 556/1000 [3:25:24<1:26:06, 11.64s/it] 56%|██████████▌        | 557/1000 [3:25:32<1:17:15, 10.46s/it]                                                               {'loss': 2.416, 'grad_norm': 0.7166879177093506, 'learning_rate': 8.676009309092272e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1069.57, 'epoch': 0.56}
- 56%|██████████▌        | 557/1000 [3:25:32<1:17:15, 10.46s/it] 56%|██████████▌        | 558/1000 [3:25:39<1:10:59,  9.64s/it]                                                               {'loss': 2.3493, 'grad_norm': 0.6673253178596497, 'learning_rate': 8.643913880174448e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1157.33, 'epoch': 0.56}
- 56%|██████████▌        | 558/1000 [3:25:39<1:10:59,  9.64s/it] 56%|██████████▌        | 559/1000 [3:25:47<1:06:35,  9.06s/it]                                                               {'loss': 2.3661, 'grad_norm': 0.6836730241775513, 'learning_rate': 8.611832675958336e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1158.77, 'epoch': 0.56}
- 56%|██████████▌        | 559/1000 [3:25:47<1:06:35,  9.06s/it] 56%|██████████▋        | 560/1000 [3:25:55<1:03:26,  8.65s/it]                                                               {'loss': 2.4316, 'grad_norm': 0.7587626576423645, 'learning_rate': 8.579766032960582e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 951.1, 'epoch': 0.56}
- 56%|██████████▋        | 560/1000 [3:25:55<1:03:26,  8.65s/it] 56%|██████████▋        | 561/1000 [3:26:02<1:01:11,  8.36s/it]                                                               {'loss': 2.6071, 'grad_norm': 0.7275318503379822, 'learning_rate': 8.5477142875451e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1266.8, 'epoch': 0.56}
- 56%|██████████▋        | 561/1000 [3:26:02<1:01:11,  8.36s/it] 56%|███████████▊         | 562/1000 [3:26:10<59:35,  8.16s/it]                                                               {'loss': 2.5722, 'grad_norm': 0.8268294334411621, 'learning_rate': 8.515677775919527e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1130.79, 'epoch': 0.56}
- 56%|███████████▊         | 562/1000 [3:26:10<59:35,  8.16s/it] 56%|███████████▊         | 563/1000 [3:26:18<58:26,  8.02s/it]                                                               {'loss': 2.4569, 'grad_norm': 0.968206524848938, 'learning_rate': 8.48365683413172e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 611.53, 'epoch': 0.56}
- 56%|███████████▊         | 563/1000 [3:26:18<58:26,  8.02s/it] 56%|███████████▊         | 564/1000 [3:26:25<57:34,  7.92s/it]                                                               {'loss': 2.4221, 'grad_norm': 0.9141545295715332, 'learning_rate': 8.451651798066203e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 648.56, 'epoch': 0.56}
- 56%|███████████▊         | 564/1000 [3:26:25<57:34,  7.92s/it] 56%|███████████▊         | 565/1000 [3:26:33<56:56,  7.85s/it]                                                               {'loss': 2.6403, 'grad_norm': 0.8103004097938538, 'learning_rate': 8.419663003440657e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 783.9, 'epoch': 0.56}
- 56%|███████████▊         | 565/1000 [3:26:33<56:56,  7.85s/it] 57%|███████████▉         | 566/1000 [3:26:41<56:30,  7.81s/it]                                                               {'loss': 2.4694, 'grad_norm': 0.7508212327957153, 'learning_rate': 8.387690785802402e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 962.66, 'epoch': 0.57}
- 57%|███████████▉         | 566/1000 [3:26:41<56:30,  7.81s/it] 57%|███████████▉         | 567/1000 [3:26:49<56:08,  7.78s/it]                                                               {'loss': 2.531, 'grad_norm': 0.8748713731765747, 'learning_rate': 8.355735480524874e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 740.43, 'epoch': 0.57}
- 57%|███████████▉         | 567/1000 [3:26:49<56:08,  7.78s/it] 57%|███████████▉         | 568/1000 [3:26:56<55:55,  7.77s/it]                                                               {'loss': 2.4077, 'grad_norm': 0.8315381407737732, 'learning_rate': 8.323797422804099e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 779.31, 'epoch': 0.57}
- 57%|███████████▉         | 568/1000 [3:26:56<55:55,  7.77s/it] 57%|███████████▉         | 569/1000 [3:27:04<55:41,  7.75s/it]                                                               {'loss': 2.4068, 'grad_norm': 0.7292707562446594, 'learning_rate': 8.291876947655196e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1015.22, 'epoch': 0.57}
- 57%|███████████▉         | 569/1000 [3:27:04<55:41,  7.75s/it] 57%|███████████▉         | 570/1000 [3:27:12<55:33,  7.75s/it]                                                               {'loss': 2.4669, 'grad_norm': 0.6813160181045532, 'learning_rate': 8.259974389908842e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1148.42, 'epoch': 0.57}
- 57%|███████████▉         | 570/1000 [3:27:12<55:33,  7.75s/it] 57%|███████████▉         | 571/1000 [3:27:20<55:26,  7.75s/it]                                                               {'loss': 2.6319, 'grad_norm': 0.8669239282608032, 'learning_rate': 8.228090084207774e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 699.21, 'epoch': 0.57}
- 57%|███████████▉         | 571/1000 [3:27:20<55:26,  7.75s/it] 57%|████████████         | 572/1000 [3:27:27<55:17,  7.75s/it]                                                               {'loss': 2.5816, 'grad_norm': 0.7362913489341736, 'learning_rate': 8.196224365003267e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1005.87, 'epoch': 0.57}
- 57%|████████████         | 572/1000 [3:27:27<55:17,  7.75s/it] 57%|████████████         | 573/1000 [3:27:35<55:07,  7.75s/it]                                                               {'loss': 2.2189, 'grad_norm': 0.8342780470848083, 'learning_rate': 8.16437756655164e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 736.3, 'epoch': 0.57}
- 57%|████████████         | 573/1000 [3:27:35<55:07,  7.75s/it] 57%|████████████         | 574/1000 [3:27:43<54:57,  7.74s/it]                                                               {'loss': 2.5392, 'grad_norm': 0.9883151054382324, 'learning_rate': 8.132550022910737e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 591.36, 'epoch': 0.57}
- 57%|████████████         | 574/1000 [3:27:43<54:57,  7.74s/it] 57%|████████████         | 575/1000 [3:27:51<54:52,  7.75s/it]                                                               {'loss': 2.4561, 'grad_norm': 0.7164676785469055, 'learning_rate': 8.100742067936431e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1114.35, 'epoch': 0.57}
- 57%|████████████         | 575/1000 [3:27:51<54:52,  7.75s/it] 58%|████████████         | 576/1000 [3:27:58<54:46,  7.75s/it]                                                               {'loss': 2.3648, 'grad_norm': 0.7581331133842468, 'learning_rate': 8.068954035279121e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 894.27, 'epoch': 0.58}
- 58%|████████████         | 576/1000 [3:27:58<54:46,  7.75s/it] 58%|████████████         | 577/1000 [3:28:06<54:34,  7.74s/it]                                                               {'loss': 2.3076, 'grad_norm': 0.8559712171554565, 'learning_rate': 8.037186258380226e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 701.37, 'epoch': 0.58}
- 58%|████████████         | 577/1000 [3:28:06<54:34,  7.74s/it] 58%|████████████▏        | 578/1000 [3:28:14<54:30,  7.75s/it]                                                               {'loss': 2.2871, 'grad_norm': 0.697607159614563, 'learning_rate': 8.005439070468692e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1218.73, 'epoch': 0.58}
- 58%|████████████▏        | 578/1000 [3:28:14<54:30,  7.75s/it] 58%|████████████▏        | 579/1000 [3:28:21<54:15,  7.73s/it]                                                               {'loss': 2.5493, 'grad_norm': 0.7945352792739868, 'learning_rate': 7.973712804557501e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 967.63, 'epoch': 0.58}
- 58%|████████████▏        | 579/1000 [3:28:21<54:15,  7.73s/it] 58%|████████████▏        | 580/1000 [3:28:29<54:04,  7.72s/it]                                                               {'loss': 2.3485, 'grad_norm': 0.8525161743164062, 'learning_rate': 7.942007793440164e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 785.91, 'epoch': 0.58}
- 58%|████████████▏        | 580/1000 [3:28:29<54:04,  7.72s/it] 58%|████████████▏        | 581/1000 [3:28:37<53:54,  7.72s/it]                                                               {'loss': 2.5032, 'grad_norm': 0.858935534954071, 'learning_rate': 7.91032436968725e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 824.91, 'epoch': 0.58}
- 58%|████████████▏        | 581/1000 [3:28:37<53:54,  7.72s/it] 58%|████████████▏        | 582/1000 [3:28:45<53:44,  7.72s/it]                                                               {'loss': 2.6223, 'grad_norm': 0.9014143347740173, 'learning_rate': 7.878662865642881e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 785.26, 'epoch': 0.58}
- 58%|████████████▏        | 582/1000 [3:28:45<53:44,  7.72s/it] 58%|████████████▏        | 583/1000 [3:28:52<53:34,  7.71s/it]                                                               {'loss': 2.3661, 'grad_norm': 0.8252571225166321, 'learning_rate': 7.847023613421251e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 774.19, 'epoch': 0.58}
- 58%|████████████▏        | 583/1000 [3:28:52<53:34,  7.71s/it] 58%|████████████▎        | 584/1000 [3:29:00<53:26,  7.71s/it]                                                               {'loss': 2.3263, 'grad_norm': 0.8073137998580933, 'learning_rate': 7.815406944903147e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 950.37, 'epoch': 0.58}
- 58%|████████████▎        | 584/1000 [3:29:00<53:26,  7.71s/it] 58%|████████████▎        | 585/1000 [3:29:08<53:16,  7.70s/it]                                                               {'loss': 2.2897, 'grad_norm': 0.7658067345619202, 'learning_rate': 7.78381319173246e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 896.37, 'epoch': 0.58}
- 58%|████████████▎        | 585/1000 [3:29:08<53:16,  7.70s/it] 59%|████████████▎        | 586/1000 [3:29:15<53:06,  7.70s/it]                                                               {'loss': 2.4489, 'grad_norm': 0.8862242102622986, 'learning_rate': 7.75224268531271e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 751.05, 'epoch': 0.59}
- 59%|████████████▎        | 586/1000 [3:29:15<53:06,  7.70s/it] 59%|████████████▎        | 587/1000 [3:29:23<53:04,  7.71s/it]                                                               {'loss': 2.6466, 'grad_norm': 0.6905261874198914, 'learning_rate': 7.72069575680357e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1276.14, 'epoch': 0.59}
- 59%|████████████▎        | 587/1000 [3:29:23<53:04,  7.71s/it] 59%|████████████▎        | 588/1000 [3:29:31<52:52,  7.70s/it]                                                               {'loss': 2.5385, 'grad_norm': 0.8508994579315186, 'learning_rate': 7.689172737117389e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 833.57, 'epoch': 0.59}
- 59%|████████████▎        | 588/1000 [3:29:31<52:52,  7.70s/it] 59%|████████████▎        | 589/1000 [3:29:38<52:44,  7.70s/it]                                                               {'loss': 2.499, 'grad_norm': 1.05769944190979, 'learning_rate': 7.657673956915735e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 552.84, 'epoch': 0.59}
- 59%|████████████▎        | 589/1000 [3:29:38<52:44,  7.70s/it] 59%|████████████▍        | 590/1000 [3:29:46<52:32,  7.69s/it]                                                               {'loss': 2.4375, 'grad_norm': 0.8643378019332886, 'learning_rate': 7.626199746605903e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 804.46, 'epoch': 0.59}
- 59%|████████████▍        | 590/1000 [3:29:46<52:32,  7.69s/it] 59%|████████████▍        | 591/1000 [3:29:54<52:25,  7.69s/it]                                                               {'loss': 2.5396, 'grad_norm': 0.6926016807556152, 'learning_rate': 7.594750436337467e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1381.61, 'epoch': 0.59}
- 59%|████████████▍        | 591/1000 [3:29:54<52:25,  7.69s/it] 59%|████████████▍        | 592/1000 [3:30:02<52:15,  7.69s/it]                                                               {'loss': 2.3787, 'grad_norm': 0.7245242595672607, 'learning_rate': 7.563326355998803e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1001.07, 'epoch': 0.59}
- 59%|████████████▍        | 592/1000 [3:30:02<52:15,  7.69s/it] 59%|████████████▍        | 593/1000 [3:30:09<52:06,  7.68s/it]                                                               {'loss': 2.4813, 'grad_norm': 0.8943284153938293, 'learning_rate': 7.531927835213656e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 725.56, 'epoch': 0.59}
- 59%|████████████▍        | 593/1000 [3:30:09<52:06,  7.68s/it] 59%|████████████▍        | 594/1000 [3:30:17<52:04,  7.69s/it]                                                               {'loss': 2.4416, 'grad_norm': 0.7557161450386047, 'learning_rate': 7.500555203337647e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1083.59, 'epoch': 0.59}
- 59%|████████████▍        | 594/1000 [3:30:17<52:04,  7.69s/it] 60%|████████████▍        | 595/1000 [3:30:25<51:59,  7.70s/it]                                                               {'loss': 2.3139, 'grad_norm': 0.7276597023010254, 'learning_rate': 7.469208789454838e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1035.94, 'epoch': 0.59}
- 60%|████████████▍        | 595/1000 [3:30:25<51:59,  7.70s/it] 60%|████████████▌        | 596/1000 [3:30:32<51:46,  7.69s/it]                                                               {'loss': 2.4906, 'grad_norm': 0.796618640422821, 'learning_rate': 7.437888922374276e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 960.91, 'epoch': 0.6}
- 60%|████████████▌        | 596/1000 [3:30:32<51:46,  7.69s/it] 60%|████████████▌        | 597/1000 [3:30:40<51:40,  7.69s/it]                                                               {'loss': 2.2598, 'grad_norm': 0.8772359490394592, 'learning_rate': 7.40659593062655e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 674.23, 'epoch': 0.6}
- 60%|████████████▌        | 597/1000 [3:30:40<51:40,  7.69s/it] 60%|████████████▌        | 598/1000 [3:30:48<51:33,  7.69s/it]                                                               {'loss': 2.5746, 'grad_norm': 0.7713838815689087, 'learning_rate': 7.37533014246033e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 938.81, 'epoch': 0.6}
- 60%|████████████▌        | 598/1000 [3:30:48<51:33,  7.69s/it] 60%|████████████▌        | 599/1000 [3:30:55<51:27,  7.70s/it]                                                               {'loss': 2.4332, 'grad_norm': 0.7264821529388428, 'learning_rate': 7.344091885838948e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1095.31, 'epoch': 0.6}
- 60%|████████████▌        | 599/1000 [3:30:55<51:27,  7.70s/it] 60%|████████████▌        | 600/1000 [3:31:03<51:19,  7.70s/it]                                                               {'loss': 2.6304, 'grad_norm': 0.7772645950317383, 'learning_rate': 7.312881488436927e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 993.43, 'epoch': 0.6}
- 60%|████████████▌        | 600/1000 [3:31:03<51:19,  7.70s/it][2025-10-18 22:33:50,233] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 22:33:53,294] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4668872356414795
-[2025-10-18 22:33:54,731] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.436579704284668
-[2025-10-18 22:33:56,150] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.418311595916748
-[2025-10-18 22:33:57,648] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.497866153717041
-[2025-10-18 22:33:57,648] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.19it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.33it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.78it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.67it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.79it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.66it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.78it/s][A
-  7%|█▋                       | 12/179 [00:04<00:57,  2.88it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.46it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.78it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.87it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.87it/s][A
- 12%|██▉                      | 21/179 [00:07<01:04,  2.47it/s][A
- 12%|███                      | 22/179 [00:07<00:58,  2.69it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.86it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.46it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|████▎                    | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.88it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.71it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.81it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.70it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.82it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.89it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.69it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.78it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:47,  2.87it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.78it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.88it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.47it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.70it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.81it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.88it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.47it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.69it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.80it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.89it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.80it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.88it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.48it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.70it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.88it/s][A
- 36%|█████████                | 65/179 [00:23<00:45,  2.48it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.71it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:39,  2.80it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.88it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.46it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.68it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.79it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.86it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:43,  2.46it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:39,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.88it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.70it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.81it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|███████████���             | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:35,  2.71it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.47it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.80it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.87it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:32,  2.70it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.78it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.86it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.46it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.69it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:30,  2.80it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.69it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.79it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:34,  2.27it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:29,  2.58it/s][A
- 58%|█████████████▊          | 103/179 [00:38<00:27,  2.72it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.80it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.42it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.66it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:26,  2.77it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.85it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.46it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.69it/s][A
- 62%|██████████████▉         | 111/179 [00:41<00:24,  2.79it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.80it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.44it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.67it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.78it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.86it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.46it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.46it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.68it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.79it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.45it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.78it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.86it/s][A
- 74%|██████��██████████▊      | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.80it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:14,  2.88it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:16,  2.48it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.70it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.80it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.87it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.68it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.77it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.84it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.45it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.70it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.46it/s][A
- 86%|████████████████████▋   | 154/179 [00:57<00:09,  2.69it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.81it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.89it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.49it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.70it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.81it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.87it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.48it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.70it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.80it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.88it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.70it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.79it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.86it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.46it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.70it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.79it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.87it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.70it/s][A
-100%|█████���██████████████████| 179/179 [01:06<00:00,  2.53it/s][A                                                               
-                                                               [A{'eval_loss': 2.3277626037597656, 'eval_runtime': 68.707, 'eval_samples_per_second': 2.853, 'eval_steps_per_second': 1.426, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.6}
- 60%|████████████▌        | 600/1000 [3:32:19<51:19,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.53it/s][A
-                                                               [A[2025-10-18 22:35:06,362] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-600
- 60%|███████████▍       | 601/1000 [3:32:29<3:28:08, 31.30s/it]                                                               {'loss': 2.5774, 'grad_norm': 0.8923317193984985, 'learning_rate': 7.281699277636572e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 746.83, 'epoch': 0.6}
- 60%|███████████▍       | 601/1000 [3:32:29<3:28:08, 31.30s/it] 60%|███████████▍       | 602/1000 [3:32:37<2:40:30, 24.20s/it]                                                               {'loss': 2.3598, 'grad_norm': 0.8658432960510254, 'learning_rate': 7.250545580524515e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 809.73, 'epoch': 0.6}
- 60%|███████████▍       | 602/1000 [3:32:37<2:40:30, 24.20s/it] 60%|███████████▍       | 603/1000 [3:32:45<2:07:19, 19.24s/it]                                                               {'loss': 2.5175, 'grad_norm': 0.7809834480285645, 'learning_rate': 7.2194207238883e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 954.94, 'epoch': 0.6}
- 60%|███████████▍       | 603/1000 [3:32:45<2:07:19, 19.24s/it] 60%|███████████▍       | 604/1000 [3:32:52<1:44:02, 15.76s/it]                                                               {'loss': 2.4052, 'grad_norm': 0.679754912853241, 'learning_rate': 7.188325034212943e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1135.7, 'epoch': 0.6}
- 60%|███████████▍       | 604/1000 [3:32:52<1:44:02, 15.76s/it] 60%|███████████▍       | 605/1000 [3:33:00<1:27:46, 13.33s/it]                                                               {'loss': 2.5981, 'grad_norm': 0.9048441052436829, 'learning_rate': 7.157258837677514e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 727.82, 'epoch': 0.6}
- 60%|███████████▍       | 605/1000 [3:33:00<1:27:46, 13.33s/it] 61%|███████████▌       | 606/1000 [3:33:08<1:16:27, 11.64s/it]                                                               {'loss': 2.5815, 'grad_norm': 0.8156535625457764, 'learning_rate': 7.126222460151719e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 847.78, 'epoch': 0.61}
- 61%|███████████▌       | 606/1000 [3:33:08<1:16:27, 11.64s/it] 61%|███████████▌       | 607/1000 [3:33:15<1:08:31, 10.46s/it]                                                               {'loss': 2.464, 'grad_norm': 0.8041353225708008, 'learning_rate': 7.095216227192467e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 897.06, 'epoch': 0.61}
- 61%|███████████▌       | 607/1000 [3:33:15<1:08:31, 10.46s/it] 61%|███████████▌       | 608/1000 [3:33:23<1:02:52,  9.62s/it]                                                               {'loss': 2.5744, 'grad_norm': 0.8446195125579834, 'learning_rate': 7.064240464040473e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 843.28, 'epoch': 0.61}
- 61%|███████████▌       | 608/1000 [3:33:23<1:02:52,  9.62s/it] 61%|████████████▊        | 609/1000 [3:33:31<58:56,  9.04s/it]                                                               {'loss': 2.6261, 'grad_norm': 0.7581222653388977, 'learning_rate': 7.033295495616834e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1062.51, 'epoch': 0.61}
- 61%|████████████▊        | 609/1000 [3:33:31<58:56,  9.04s/it] 61%|████████████▊        | 610/1000 [3:33:39<56:03,  8.62s/it]                                                               {'loss': 2.3681, 'grad_norm': 0.9517714977264404, 'learning_rate': 7.002381646519625e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 556.04, 'epoch': 0.61}
- 61%|████████████▊        | 610/1000 [3:33:39<56:03,  8.62s/it] 61%|████████████▊        | 611/1000 [3:33:46<54:07,  8.35s/it]                                                               {'loss': 2.5815, 'grad_norm': 0.8389623165130615, 'learning_rate': 6.971499241020495e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 768.65, 'epoch': 0.61}
- 61%|████████████▊        | 611/1000 [3:33:46<54:07,  8.35s/it] 61%|████████████▊        | 612/1000 [3:33:54<52:42,  8.15s/it]                                                               {'loss': 2.4862, 'grad_norm': 0.8437785506248474, 'learning_rate': 6.940648603061263e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 912.58, 'epoch': 0.61}
- 61%|████████████▊        | 612/1000 [3:33:54<52:42,  8.15s/it] 61%|████████████▊        | 613/1000 [3:34:02<51:43,  8.02s/it]                                                               {'loss': 2.168, 'grad_norm': 0.8672930002212524, 'learning_rate': 6.909830056250527e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 861.46, 'epoch': 0.61}
- 61%|████████████▊        | 613/1000 [3:34:02<51:43,  8.02s/it] 61%|████████████▉        | 614/1000 [3:34:09<50:58,  7.92s/it]                                                               {'loss': 2.302, 'grad_norm': 0.7975133657455444, 'learning_rate': 6.879043923860257e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 833.02, 'epoch': 0.61}
- 61%|████████████▉        | 614/1000 [3:34:09<50:58,  7.92s/it] 62%|████████████▉        | 615/1000 [3:34:17<50:24,  7.85s/it]                                                               {'loss': 2.6643, 'grad_norm': 0.8281568288803101, 'learning_rate': 6.848290528822416e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 912.99, 'epoch': 0.61}
- 62%|████████████▉        | 615/1000 [3:34:17<50:24,  7.85s/it] 62%|████████████▉        | 616/1000 [3:34:25<49:57,  7.81s/it]                                                               {'loss': 2.5776, 'grad_norm': 0.7583959698677063, 'learning_rate': 6.817570193725564e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1076.63, 'epoch': 0.62}
- 62%|████████████▉        | 616/1000 [3:34:25<49:57,  7.81s/it] 62%|████████████▉        | 617/1000 [3:34:32<49:36,  7.77s/it]                                                               {'loss': 2.4297, 'grad_norm': 0.7941475510597229, 'learning_rate': 6.786883240811479e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 883.61, 'epoch': 0.62}
- 62%|████████████▉        | 617/1000 [3:34:32<49:36,  7.77s/it] 62%|████████████▉        | 618/1000 [3:34:40<49:20,  7.75s/it]                                                               {'loss': 2.6729, 'grad_norm': 0.8197606801986694, 'learning_rate': 6.756229991971779e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 951.46, 'epoch': 0.62}
- 62%|████████████▉        | 618/1000 [3:34:40<49:20,  7.75s/it] 62%|████████████▉        | 619/1000 [3:34:48<49:05,  7.73s/it]                                                               {'loss': 2.5645, 'grad_norm': 0.7754367589950562, 'learning_rate': 6.725610768744534e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1025.74, 'epoch': 0.62}
- 62%|████████████▉        | 619/1000 [3:34:48<49:05,  7.73s/it] 62%|█████████████        | 620/1000 [3:34:55<48:51,  7.71s/it]                                                               {'loss': 2.449, 'grad_norm': 0.7631040215492249, 'learning_rate': 6.695025892310914e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 967.86, 'epoch': 0.62}
- 62%|█████████████        | 620/1000 [3:34:55<48:51,  7.71s/it] 62%|█████████████        | 621/1000 [3:35:03<48:37,  7.70s/it]                                                               {'loss': 2.5488, 'grad_norm': 0.8603218197822571, 'learning_rate': 6.664475683491796e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 746.57, 'epoch': 0.62}
- 62%|█████████████        | 621/1000 [3:35:03<48:37,  7.70s/it] 62%|█████████████        | 622/1000 [3:35:11<48:28,  7.70s/it]                                                               {'loss': 2.279, 'grad_norm': 0.9059550762176514, 'learning_rate': 6.633960462744416e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 730.93, 'epoch': 0.62}
- 62%|█████████████        | 622/1000 [3:35:11<48:28,  7.70s/it] 62%|█████████████        | 623/1000 [3:35:19<48:23,  7.70s/it]                                                               {'loss': 2.6607, 'grad_norm': 0.7636969089508057, 'learning_rate': 6.603480550158995e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1063.5, 'epoch': 0.62}
- 62%|█████████████        | 623/1000 [3:35:19<48:23,  7.70s/it] 62%|█████████████        | 624/1000 [3:35:26<48:13,  7.70s/it]                                                               {'loss': 2.3411, 'grad_norm': 0.9675534963607788, 'learning_rate': 6.5730362654554e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 731.77, 'epoch': 0.62}
- 62%|█████████████        | 624/1000 [3:35:26<48:13,  7.70s/it] 62%|█████████████▏       | 625/1000 [3:35:34<48:05,  7.69s/it]                                                               {'loss': 2.5218, 'grad_norm': 0.8488072752952576, 'learning_rate': 6.542627927979771e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 781.48, 'epoch': 0.62}
- 62%|█████████████▏       | 625/1000 [3:35:34<48:05,  7.69s/it] 63%|█████████████▏       | 626/1000 [3:35:42<47:59,  7.70s/it]                                                               {'loss': 2.383, 'grad_norm': 0.7528684735298157, 'learning_rate': 6.512255856701177e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 955.59, 'epoch': 0.63}
- 63%|█████████████▏       | 626/1000 [3:35:42<47:59,  7.70s/it] 63%|█████████████▏       | 627/1000 [3:35:49<47:50,  7.70s/it]                                                               {'loss': 2.4217, 'grad_norm': 0.9540525674819946, 'learning_rate': 6.481920370208274e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 585.73, 'epoch': 0.63}
- 63%|█████████████▏       | 627/1000 [3:35:49<47:50,  7.70s/it] 63%|█████████████▏       | 628/1000 [3:35:57<47:49,  7.71s/it]                                                               {'loss': 2.4975, 'grad_norm': 0.6794441342353821, 'learning_rate': 6.451621786705962e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1261.51, 'epoch': 0.63}
- 63%|█████████████▏       | 628/1000 [3:35:57<47:49,  7.71s/it] 63%|█████████████▏       | 629/1000 [3:36:05<47:36,  7.70s/it]                                                               {'loss': 2.3908, 'grad_norm': 0.9456835985183716, 'learning_rate': 6.42136042401204e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 657.36, 'epoch': 0.63}
- 63%|█████████████▏       | 629/1000 [3:36:05<47:36,  7.70s/it] 63%|█████████████▏       | 630/1000 [3:36:12<47:29,  7.70s/it]                                                               {'loss': 2.4599, 'grad_norm': 0.6179218888282776, 'learning_rate': 6.39113659955389e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1461.62, 'epoch': 0.63}
- 63%|█████████████▏       | 630/1000 [3:36:12<47:29,  7.70s/it] 63%|█████████████▎       | 631/1000 [3:36:20<47:21,  7.70s/it]                                                               {'loss': 2.4994, 'grad_norm': 0.7038080096244812, 'learning_rate': 6.360950630365126e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1092.66, 'epoch': 0.63}
- 63%|█████████████▎       | 631/1000 [3:36:20<47:21,  7.70s/it] 63%|█████████████▎       | 632/1000 [3:36:28<47:14,  7.70s/it]                                                               {'loss': 2.4031, 'grad_norm': 0.7085487842559814, 'learning_rate': 6.330802833082279e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1004.48, 'epoch': 0.63}
- 63%|█████████████▎       | 632/1000 [3:36:28<47:14,  7.70s/it] 63%|█████████████▎       | 633/1000 [3:36:36<47:05,  7.70s/it]                                                               {'loss': 2.468, 'grad_norm': 0.6621479988098145, 'learning_rate': 6.300693523941482e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1254.08, 'epoch': 0.63}
- 63%|█████████████▎       | 633/1000 [3:36:36<47:05,  7.70s/it] 63%|█████████████▎       | 634/1000 [3:36:43<46:56,  7.70s/it]                                                               {'loss': 2.5287, 'grad_norm': 0.7496339678764343, 'learning_rate': 6.270623018775135e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1030.66, 'epoch': 0.63}
- 63%|█████████████▎       | 634/1000 [3:36:43<46:56,  7.70s/it] 64%|█████████████▎       | 635/1000 [3:36:51<46:52,  7.70s/it]                                                               {'loss': 2.5119, 'grad_norm': 0.788546621799469, 'learning_rate': 6.24059163300861e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 921.0, 'epoch': 0.64}
- 64%|█████████████▎       | 635/1000 [3:36:51<46:52,  7.70s/it] 64%|█████████████▎       | 636/1000 [3:36:59<46:42,  7.70s/it]                                                               {'loss': 2.3591, 'grad_norm': 0.7324807643890381, 'learning_rate': 6.210599681656933e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1029.56, 'epoch': 0.64}
- 64%|█████████████▎       | 636/1000 [3:36:59<46:42,  7.70s/it] 64%|█████████████▍       | 637/1000 [3:37:06<46:36,  7.70s/it]                                                               {'loss': 2.3566, 'grad_norm': 0.7097291946411133, 'learning_rate': 6.180647479321485e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1053.15, 'epoch': 0.64}
- 64%|█████████████▍       | 637/1000 [3:37:06<46:36,  7.70s/it] 64%|█████████████▍       | 638/1000 [3:37:14<46:29,  7.71s/it]                                                               {'loss': 2.5053, 'grad_norm': 0.8688352704048157, 'learning_rate': 6.15073534018669e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 791.26, 'epoch': 0.64}
- 64%|█████████████▍       | 638/1000 [3:37:14<46:29,  7.71s/it] 64%|█████████████▍       | 639/1000 [3:37:22<46:20,  7.70s/it]                                                               {'loss': 2.5192, 'grad_norm': 0.6615906953811646, 'learning_rate': 6.120863578016735e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1219.88, 'epoch': 0.64}
- 64%|█████████████▍       | 639/1000 [3:37:22<46:20,  7.70s/it] 64%|█████████████▍       | 640/1000 [3:37:29<46:10,  7.69s/it]                                                               {'loss': 2.3922, 'grad_norm': 0.7557700872421265, 'learning_rate': 6.091032506152274e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1001.3, 'epoch': 0.64}
- 64%|█████████████▍       | 640/1000 [3:37:29<46:10,  7.69s/it] 64%|█████████████▍       | 641/1000 [3:37:37<46:05,  7.70s/it]                                                               {'loss': 2.2964, 'grad_norm': 0.6998468041419983, 'learning_rate': 6.061242437507131e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1126.33, 'epoch': 0.64}
- 64%|█████████████▍       | 641/1000 [3:37:37<46:05,  7.70s/it] 64%|█████████████▍       | 642/1000 [3:37:45<45:54,  7.69s/it]                                                               {'loss': 2.5175, 'grad_norm': 0.8638777136802673, 'learning_rate': 6.031493684565029e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 717.55, 'epoch': 0.64}
- 64%|█████████████▍       | 642/1000 [3:37:45<45:54,  7.69s/it] 64%|█████████████▌       | 643/1000 [3:37:52<45:45,  7.69s/it]                                                               {'loss': 2.4585, 'grad_norm': 0.775001585483551, 'learning_rate': 6.00178655937631e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 973.54, 'epoch': 0.64}
- 64%|█████████████▌       | 643/1000 [3:37:52<45:45,  7.69s/it] 64%|█████████████▌       | 644/1000 [3:38:00<45:39,  7.70s/it]                                                               {'loss': 2.2242, 'grad_norm': 0.7269330620765686, 'learning_rate': 5.972121373554664e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1013.33, 'epoch': 0.64}
- 64%|█████████████▌       | 644/1000 [3:38:00<45:39,  7.70s/it] 64%|█████████████▌       | 645/1000 [3:38:08<45:33,  7.70s/it]                                                               {'loss': 2.4815, 'grad_norm': 0.8459643721580505, 'learning_rate': 5.942498438273849e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 856.57, 'epoch': 0.65}
- 64%|█████████████▌       | 645/1000 [3:38:08<45:33,  7.70s/it] 65%|█████████████▌       | 646/1000 [3:38:16<45:24,  7.70s/it]                                                               {'loss': 2.541, 'grad_norm': 0.9290534853935242, 'learning_rate': 5.9129180642644414e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 721.13, 'epoch': 0.65}
- 65%|█████████████▌       | 646/1000 [3:38:16<45:24,  7.70s/it] 65%|█████████████▌       | 647/1000 [3:38:23<45:16,  7.70s/it]                                                               {'loss': 2.3556, 'grad_norm': 0.8678464889526367, 'learning_rate': 5.883380561810563e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 803.6, 'epoch': 0.65}
- 65%|█████████████▌       | 647/1000 [3:38:23<45:16,  7.70s/it] 65%|█████████████▌       | 648/1000 [3:38:31<45:07,  7.69s/it]                                                               {'loss': 2.6034, 'grad_norm': 0.9382994771003723, 'learning_rate': 5.8538862407466425e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 785.49, 'epoch': 0.65}
- 65%|█████████████▌       | 648/1000 [3:38:31<45:07,  7.69s/it] 65%|█████████████▋       | 649/1000 [3:38:39<45:02,  7.70s/it]                                                               {'loss': 2.5946, 'grad_norm': 0.8191311359405518, 'learning_rate': 5.82443541045415e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 868.9, 'epoch': 0.65}
- 65%|█████████████▋       | 649/1000 [3:38:39<45:02,  7.70s/it] 65%|█████████████▋       | 650/1000 [3:38:46<44:53,  7.70s/it]                                                               {'loss': 2.4854, 'grad_norm': 1.0460948944091797, 'learning_rate': 5.795028379858355e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 510.14, 'epoch': 0.65}
- 65%|█████████████▋       | 650/1000 [3:38:46<44:53,  7.70s/it][2025-10-18 22:41:33,498] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 22:41:36,496] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4593720436096191
-[2025-10-18 22:41:37,937] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4411859512329102
-[2025-10-18 22:41:39,394] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4563953876495361
-[2025-10-18 22:41:40,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4184303283691406
-[2025-10-18 22:41:40,813] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:29,  6.00it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.32it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.78it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.67it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.80it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.66it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.78it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.46it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.70it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.80it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.88it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.47it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.87it/s][A
- 12%|██▉                      | 21/179 [00:07<01:03,  2.48it/s][A
- 12%|███                      | 22/179 [00:07<00:58,  2.70it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.79it/s][A
- 13%|███▎                     | 24/179 [00:08<00:54,  2.86it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.48it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.46it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|████▎                    | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.87it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.71it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.81it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.70it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.80it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.87it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.71it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.82it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.88it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.80it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.87it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.46it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.79it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.78it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:43,  2.85it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.46it/s][A
- 32%|████████                 | 58/179 [00:21<00:45,  2.68it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.79it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.87it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.66it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.78it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.85it/s][A
- 36%|█████████                | 65/179 [00:24<00:46,  2.44it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:42,  2.67it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.78it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.86it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.46it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.68it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.79it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.86it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:39,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.81it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.88it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.80it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.46it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:35,  2.70it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.88it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.47it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.80it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.87it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:33,  2.64it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:32,  2.74it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.84it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:35,  2.46it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.68it/s][A
- 53%|█████████████▎           | 95/179 [00:35<00:30,  2.80it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.48it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:29,  2.70it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.81it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.47it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.80it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.88it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.45it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.68it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.87it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.71it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.80it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.78it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.80it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.79it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.45it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.78it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.85it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.45it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.67it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.77it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.84it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.45it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.68it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.77it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.84it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.43it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.67it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.44it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.78it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.86it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.46it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.69it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.79it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.88it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|████████████████████▋   | 154/179 [00:57<00:09,  2.68it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.77it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.86it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.79it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.68it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.87it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.87it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.46it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.79it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.44it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.69it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.53it/s][A                                                               
-                                                               [A{'eval_loss': 2.310439348220825, 'eval_runtime': 68.7578, 'eval_samples_per_second': 2.851, 'eval_steps_per_second': 1.425, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.65}
- 65%|█████████████▋       | 650/1000 [3:40:02<44:53,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.53it/s][A
-                                                               [A[2025-10-18 22:42:49,578] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-650
- 65%|████████████▎      | 651/1000 [3:40:13<3:01:45, 31.25s/it]                                                               {'loss': 2.487, 'grad_norm': 0.7977380752563477, 'learning_rate': 5.765665457425102e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 805.92, 'epoch': 0.65}
- 65%|████████████▎      | 651/1000 [3:40:13<3:01:45, 31.25s/it] 65%|████████████▍      | 652/1000 [3:40:20<2:20:13, 24.18s/it]                                                               {'loss': 2.4428, 'grad_norm': 0.7626097202301025, 'learning_rate': 5.736346951157544e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 989.29, 'epoch': 0.65}
- 65%|████████████▍      | 652/1000 [3:40:20<2:20:13, 24.18s/it] 65%|████████████▍      | 653/1000 [3:40:28<1:51:08, 19.22s/it]                                                               {'loss': 2.5872, 'grad_norm': 0.8272733092308044, 'learning_rate': 5.707073168592942e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 841.95, 'epoch': 0.65}
- 65%|████████████▍      | 653/1000 [3:40:28<1:51:08, 19.22s/it] 65%|████████████▍      | 654/1000 [3:40:36<1:30:52, 15.76s/it]                                                               {'loss': 2.3164, 'grad_norm': 0.7491837739944458, 'learning_rate': 5.677844416799424e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 901.79, 'epoch': 0.65}
- 65%|████████████▍      | 654/1000 [3:40:36<1:30:52, 15.76s/it] 66%|████████████▍      | 655/1000 [3:40:43<1:16:38, 13.33s/it]                                                               {'loss': 2.6218, 'grad_norm': 0.868552029132843, 'learning_rate': 5.648661002372768e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 835.69, 'epoch': 0.66}
- 66%|████████████▍      | 655/1000 [3:40:43<1:16:38, 13.33s/it] 66%|████████████▍      | 656/1000 [3:40:51<1:06:42, 11.63s/it]                                                               {'loss': 2.528, 'grad_norm': 0.8816778659820557, 'learning_rate': 5.6195232314331766e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 741.29, 'epoch': 0.66}
- 66%|████████████▍      | 656/1000 [3:40:51<1:06:42, 11.63s/it] 66%|█████████████▊       | 657/1000 [3:40:59<59:42, 10.44s/it]                                                               {'loss': 2.3701, 'grad_norm': 0.7763086557388306, 'learning_rate': 5.590431409622081e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 857.81, 'epoch': 0.66}
- 66%|█████████████▊       | 657/1000 [3:40:59<59:42, 10.44s/it] 66%|█████████████▊       | 658/1000 [3:41:06<54:49,  9.62s/it]                                                               {'loss': 2.4683, 'grad_norm': 0.7403135895729065, 'learning_rate': 5.56138584209893e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 971.51, 'epoch': 0.66}
- 66%|█████████████▊       | 658/1000 [3:41:06<54:49,  9.62s/it] 66%|█████████████▊       | 659/1000 [3:41:14<51:19,  9.03s/it]                                                               {'loss': 2.4404, 'grad_norm': 0.9425458908081055, 'learning_rate': 5.532386833537977e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 690.0, 'epoch': 0.66}
- 66%|█████████████▊       | 659/1000 [3:41:14<51:19,  9.03s/it] 66%|█████████████▊       | 660/1000 [3:41:22<48:43,  8.60s/it]                                                               {'loss': 2.3573, 'grad_norm': 0.9051120281219482, 'learning_rate': 5.503434688125104e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 756.37, 'epoch': 0.66}
- 66%|█████████████▊       | 660/1000 [3:41:22<48:43,  8.60s/it] 66%|█████████████▉       | 661/1000 [3:41:29<46:57,  8.31s/it]                                                               {'loss': 2.5456, 'grad_norm': 0.7342925667762756, 'learning_rate': 5.474529709554612e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1105.31, 'epoch': 0.66}
- 66%|█████████████▉       | 661/1000 [3:41:29<46:57,  8.31s/it] 66%|█████████████▉       | 662/1000 [3:41:37<45:36,  8.10s/it]                                                               {'loss': 2.2785, 'grad_norm': 0.807798445224762, 'learning_rate': 5.445672201026054e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 822.76, 'epoch': 0.66}
- 66%|█████████████▉       | 662/1000 [3:41:37<45:36,  8.10s/it][2025-10-18 22:44:38,688] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42528] Loading dataset: AiAF/conversations with base_type: chat_template and prompt_style: None
-[2025-10-18 22:44:38,688] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:42528] Using chat template:
----
-{{ bos_token }}
-{% for m in messages %}
-  {% set role = 'model' if m['role']=='assistant' else 'user' %}
-  {{ '<start_of_turn>' + role + '\n' + m['content'] | trim + '<end_of_turn>\n' }}
-{% endfor %}
-{% if add_generation_prompt %}
-{{ '<start_of_turn>model\n' }}
-{% endif %}
-
----
-
-Tokenizing Prompts (num_proc=12):   0%| | 0/10000 [00:00<?, ? e[A
-Tokenizing Prompts (num_proc=12):   8%| | 834/10000 [16:25<3:00[A
-Tokenizing Prompts (num_proc=12):  17%|▏| 1667/10000 [18:07<1:1[A
-Tokenizing Prompts (num_proc=12):  25%|▎| 2501/10000 [21:12<50:[A
-Tokenizing Prompts (num_proc=12):  33%|▎| 3334/10000 [21:17<27:[A
-Tokenizing Prompts (num_proc=12):  42%|▍| 4167/10000 [22:04<17:[A
-Tokenizing Prompts (num_proc=12):  50%|▌| 5000/10000 [23:50<13:[A
-Tokenizing Prompts (num_proc=12):  58%|▌| 5834/10000 [24:04<07:[A
-Tokenizing Prompts (num_proc=12):  67%|▋| 6667/10000 [24:40<05:[A
-Tokenizing Prompts (num_proc=12):  75%|▊| 7500/10000 [25:08<03:[A
-Tokenizing Prompts (num_proc=12):  83%|▊| 8334/10000 [30:54<04:[A
-Tokenizing Prompts (num_proc=12):  92%|▉| 9167/10000 [33:12<02:[A
-Tokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [34:03<00[ATokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [34:05<00
-
-Dropping Long Sequences:   0%| | 0/10000 [00:00<?, ? examples/s[A
-Dropping Long Sequences:  10%| | 1000/10000 [00:10<01:31, 98.57[A
-Dropping Long Sequences:  20%|▏| 2000/10000 [00:18<01:11, 111.1[A
-Dropping Long Sequences:  30%|▎| 3000/10000 [00:25<00:58, 119.5[A
-Dropping Long Sequences:  40%|▍| 4000/10000 [00:34<00:49, 120.5[A
-Dropping Long Sequences:  50%|▌| 5000/10000 [00:42<00:41, 120.5[A
-Dropping Long Sequences:  60%|▌| 6000/10000 [00:50<00:32, 123.2[A
-Dropping Long Sequences:  70%|▋| 7000/10000 [00:57<00:23, 127.3[A
-Dropping Long Sequences:  80%|▊| 8000/10000 [01:05<00:15, 127.4[A
-Dropping Long Sequences:  90%|▉| 9000/10000 [01:12<00:07, 128.6[A
-Dropping Long Sequences: 100%|█| 10000/10000 [01:20<00:00, 130.[ADropping Long Sequences: 100%|█| 10000/10000 [01:20<00:00, 124.
-
-Add position_id column (Pretraining Sample Packing):   0%| | 0/[A
-Add position_id column (Pretraining Sample Packing):  49%|▍| 10[A
-Add position_id column (Pretraining Sample Packing):  98%|▉| 20[AAdd position_id column (Pretraining Sample Packing): 100%|█| 20
-[2025-10-18 23:20:12,075] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42528] Using single process for pack_parallel, running sequentially.
- 66%|███████████▎     | 663/1000 [4:17:42<61:21:00, 655.37s/it]                                                               {'loss': 2.3802, 'grad_norm': 0.7211080193519592, 'learning_rate': 5.416862465241033e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1098.39, 'epoch': 0.66}
- 66%|███████████▎     | 663/1000 [4:17:42<61:21:00, 655.37s/it] 66%|███████████▎     | 664/1000 [4:17:50<43:01:30, 460.98s/it]                                                               {'loss': 2.499, 'grad_norm': 0.7285022139549255, 'learning_rate': 5.388100804400049e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1017.1, 'epoch': 0.66}
- 66%|███████████▎     | 664/1000 [4:17:50<43:01:30, 460.98s/it] 66%|███████████▎     | 665/1000 [4:17:57<30:14:05, 324.91s/it]                                                               {'loss': 2.4436, 'grad_norm': 0.8804203271865845, 'learning_rate': 5.3593875201993174e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 753.69, 'epoch': 0.67}
- 66%|███████████▎     | 665/1000 [4:17:57<30:14:05, 324.91s/it] 67%|███████████▎     | 666/1000 [4:18:05<21:18:28, 229.67s/it]                                                               {'loss': 2.6019, 'grad_norm': 0.7476909756660461, 'learning_rate': 5.3307229138275936e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1360.07, 'epoch': 0.67}
- 67%|███████████▎     | 666/1000 [4:18:05<21:18:28, 229.67s/it] 67%|███████████▎     | 667/1000 [4:18:12<15:04:48, 163.03s/it]                                                               {'loss': 2.2821, 'grad_norm': 0.6586979627609253, 'learning_rate': 5.302107285963045e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1330.6, 'epoch': 0.67}
- 67%|███████████▎     | 667/1000 [4:18:12<15:04:48, 163.03s/it] 67%|███████████▎     | 668/1000 [4:18:20<10:43:58, 116.38s/it]                                                               {'loss': 2.3872, 'grad_norm': 0.8167933821678162, 'learning_rate': 5.273540936770058e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 881.19, 'epoch': 0.67}
- 67%|███████████▎     | 668/1000 [4:18:20<10:43:58, 116.38s/it] 67%|████████████▋      | 669/1000 [4:18:27<7:41:54, 83.73s/it]                                                               {'loss': 2.3993, 'grad_norm': 0.6900964975357056, 'learning_rate': 5.245024165896126e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1086.77, 'epoch': 0.67}
- 67%|████████████▋      | 669/1000 [4:18:27<7:41:54, 83.73s/it] 67%|████████████▋      | 670/1000 [4:18:35<5:34:48, 60.87s/it]                                                               {'loss': 2.0712, 'grad_norm': 0.7222774028778076, 'learning_rate': 5.2165572724686754e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 875.14, 'epoch': 0.67}
- 67%|████████████▋      | 670/1000 [4:18:35<5:34:48, 60.87s/it] 67%|████████████▋      | 671/1000 [4:18:42<4:06:09, 44.89s/it]                                                               {'loss': 2.3397, 'grad_norm': 0.734351634979248, 'learning_rate': 5.1881405550919493e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1008.32, 'epoch': 0.67}
- 67%|████████████▋      | 671/1000 [4:18:42<4:06:09, 44.89s/it] 67%|████████████▊      | 672/1000 [4:18:50<3:04:14, 33.70s/it]                                                               {'loss': 2.6499, 'grad_norm': 0.7870229482650757, 'learning_rate': 5.1597743118438726e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1075.75, 'epoch': 0.67}
- 67%|████████████▊      | 672/1000 [4:18:50<3:04:14, 33.70s/it] 67%|████████████▊      | 673/1000 [4:18:58<2:21:02, 25.88s/it]                                                               {'loss': 2.4671, 'grad_norm': 0.7094972729682922, 'learning_rate': 5.1314588402729044e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1227.84, 'epoch': 0.67}
- 67%|████████████▊      | 673/1000 [4:18:58<2:21:02, 25.88s/it] 67%|████████████▊      | 674/1000 [4:19:05<1:50:52, 20.41s/it]                                                               {'loss': 2.4027, 'grad_norm': 0.7383370399475098, 'learning_rate': 5.103194437394952e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1240.45, 'epoch': 0.67}
- 67%|████████████▊      | 674/1000 [4:19:05<1:50:52, 20.41s/it] 68%|████████████▊      | 675/1000 [4:19:13<1:29:43, 16.56s/it]                                                               {'loss': 2.347, 'grad_norm': 0.6504874229431152, 'learning_rate': 5.074981399690218e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1293.84, 'epoch': 0.68}
- 68%|████████████▊      | 675/1000 [4:19:13<1:29:43, 16.56s/it] 68%|████████████▊      | 676/1000 [4:19:21<1:15:03, 13.90s/it]                                                               {'loss': 2.3537, 'grad_norm': 0.6710034608840942, 'learning_rate': 5.0468200231001286e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1245.18, 'epoch': 0.68}
- 68%|████████████▊      | 676/1000 [4:19:21<1:15:03, 13.90s/it] 68%|████████████▊      | 677/1000 [4:19:28<1:04:42, 12.02s/it]                                                               {'loss': 2.1318, 'grad_norm': 0.7696754336357117, 'learning_rate': 5.018710603024187e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 863.76, 'epoch': 0.68}
- 68%|████████████▊      | 677/1000 [4:19:28<1:04:42, 12.02s/it] 68%|██████████████▏      | 678/1000 [4:19:36<57:30, 10.72s/it]                                                               {'loss': 2.309, 'grad_norm': 0.7506178617477417, 'learning_rate': 4.9906534343169144e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1096.75, 'epoch': 0.68}
- 68%|██████████████▏      | 678/1000 [4:19:36<57:30, 10.72s/it] 68%|██████████████▎      | 679/1000 [4:19:44<52:28,  9.81s/it]                                                               {'loss': 2.3123, 'grad_norm': 0.6553046703338623, 'learning_rate': 4.962648811284738e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1296.36, 'epoch': 0.68}
- 68%|██████████████▎      | 679/1000 [4:19:44<52:28,  9.81s/it] 68%|██████████████▎      | 680/1000 [4:19:51<48:52,  9.17s/it]                                                               {'loss': 2.7106, 'grad_norm': 0.9204152226448059, 'learning_rate': 4.934697027682894e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 924.43, 'epoch': 0.68}
- 68%|██████████████▎      | 680/1000 [4:19:51<48:52,  9.17s/it] 68%|██████████████▎      | 681/1000 [4:19:59<46:20,  8.72s/it]                                                               {'loss': 2.408, 'grad_norm': 0.7454817891120911, 'learning_rate': 4.9067983767123736e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1104.26, 'epoch': 0.68}
- 68%|██████████████▎      | 681/1000 [4:19:59<46:20,  8.72s/it] 68%|██████████████▎      | 682/1000 [4:20:07<44:34,  8.41s/it]                                                               {'loss': 2.473, 'grad_norm': 0.7237149477005005, 'learning_rate': 4.8789531510168163e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1279.66, 'epoch': 0.68}
- 68%|██████████████▎      | 682/1000 [4:20:07<44:34,  8.41s/it] 68%|██████████████▎      | 683/1000 [4:20:14<43:17,  8.19s/it]                                                               {'loss': 2.5136, 'grad_norm': 0.7800313830375671, 'learning_rate': 4.851161642679466e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 997.89, 'epoch': 0.68}
- 68%|██████████████▎      | 683/1000 [4:20:14<43:17,  8.19s/it] 68%|██████████████▎      | 684/1000 [4:20:22<42:21,  8.04s/it]                                                               {'loss': 2.5166, 'grad_norm': 0.753099262714386, 'learning_rate': 4.8234241432200965e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1172.22, 'epoch': 0.68}
- 68%|██████████████▎      | 684/1000 [4:20:22<42:21,  8.04s/it] 68%|██████████████▍      | 685/1000 [4:20:30<41:40,  7.94s/it]                                                               {'loss': 2.399, 'grad_norm': 0.9544991850852966, 'learning_rate': 4.795740943591955e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 685.72, 'epoch': 0.69}
- 68%|██████████████▍      | 685/1000 [4:20:30<41:40,  7.94s/it] 69%|██████████████▍      | 686/1000 [4:20:37<41:09,  7.87s/it]                                                               {'loss': 2.3961, 'grad_norm': 0.7699408531188965, 'learning_rate': 4.768112334178699e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 961.78, 'epoch': 0.69}
- 69%|██████████████▍      | 686/1000 [4:20:37<41:09,  7.87s/it] 69%|██████████████▍      | 687/1000 [4:20:45<40:48,  7.82s/it]                                                               {'loss': 2.507, 'grad_norm': 0.7202984094619751, 'learning_rate': 4.74053860479137e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1258.31, 'epoch': 0.69}
- 69%|██████████████▍      | 687/1000 [4:20:45<40:48,  7.82s/it] 69%|██████████████▍      | 688/1000 [4:20:53<40:27,  7.78s/it]                                                               {'loss': 2.4226, 'grad_norm': 0.8358380794525146, 'learning_rate': 4.7130200446653475e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 787.58, 'epoch': 0.69}
- 69%|██████████████▍      | 688/1000 [4:20:53<40:27,  7.78s/it] 69%|██████████████▍      | 689/1000 [4:21:00<40:11,  7.75s/it]                                                               {'loss': 2.3909, 'grad_norm': 0.7951445579528809, 'learning_rate': 4.6855569424572955e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 786.14, 'epoch': 0.69}
- 69%|██████████████▍      | 689/1000 [4:21:01<40:11,  7.75s/it] 69%|██████████████▍      | 690/1000 [4:21:08<39:59,  7.74s/it]                                                               {'loss': 2.4717, 'grad_norm': 0.8480710387229919, 'learning_rate': 4.65814958624217e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 826.7, 'epoch': 0.69}
- 69%|██████████████▍      | 690/1000 [4:21:08<39:59,  7.74s/it] 69%|██████████████▌      | 691/1000 [4:21:16<39:48,  7.73s/it]                                                               {'loss': 2.7013, 'grad_norm': 0.8417328596115112, 'learning_rate': 4.630798263510162e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 894.32, 'epoch': 0.69}
- 69%|██████████████▌      | 691/1000 [4:21:16<39:48,  7.73s/it] 69%|██████████████▌      | 692/1000 [4:21:24<39:37,  7.72s/it]                                                               {'loss': 2.2546, 'grad_norm': 0.7272226810455322, 'learning_rate': 4.6035032611637094e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 949.6, 'epoch': 0.69}
- 69%|██████████████▌      | 692/1000 [4:21:24<39:37,  7.72s/it] 69%|██████████████▌      | 693/1000 [4:21:31<39:32,  7.73s/it]                                                               {'loss': 2.4418, 'grad_norm': 0.7242093086242676, 'learning_rate': 4.5762648655144666e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1210.7, 'epoch': 0.69}
- 69%|██████████████▌      | 693/1000 [4:21:31<39:32,  7.73s/it] 69%|██████████████▌      | 694/1000 [4:21:39<39:21,  7.72s/it]                                                               {'loss': 2.2487, 'grad_norm': 0.7796218395233154, 'learning_rate': 4.549083362280317e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 935.35, 'epoch': 0.69}
- 69%|██████████████▌      | 694/1000 [4:21:39<39:21,  7.72s/it] 70%|██████████████▌      | 695/1000 [4:21:47<39:14,  7.72s/it]                                                               {'loss': 2.2746, 'grad_norm': 0.9938075542449951, 'learning_rate': 4.5219590365823714e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1190.05, 'epoch': 0.69}
- 70%|██████████████▌      | 695/1000 [4:21:47<39:14,  7.72s/it] 70%|██████████████▌      | 696/1000 [4:21:55<39:08,  7.73s/it]                                                               {'loss': 2.4476, 'grad_norm': 0.751891553401947, 'learning_rate': 4.494892172941965e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1194.32, 'epoch': 0.7}
- 70%|██████████████▌      | 696/1000 [4:21:55<39:08,  7.73s/it] 70%|██████████████▋      | 697/1000 [4:22:02<38:59,  7.72s/it]                                                               {'loss': 2.626, 'grad_norm': 0.800341010093689, 'learning_rate': 4.467883055277695e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 951.33, 'epoch': 0.7}
- 70%|██████████████▋      | 697/1000 [4:22:02<38:59,  7.72s/it] 70%|██████████████▋      | 698/1000 [4:22:10<38:51,  7.72s/it]                                                               {'loss': 2.4951, 'grad_norm': 0.7806605100631714, 'learning_rate': 4.440931966902418e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 921.93, 'epoch': 0.7}
- 70%|██████████████▋      | 698/1000 [4:22:10<38:51,  7.72s/it] 70%|██████████████▋      | 699/1000 [4:22:18<38:39,  7.71s/it]                                                               {'loss': 2.298, 'grad_norm': 0.8109032511711121, 'learning_rate': 4.414039190520308e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 969.68, 'epoch': 0.7}
- 70%|██████████████▋      | 699/1000 [4:22:18<38:39,  7.71s/it] 70%|██████████████▋      | 700/1000 [4:22:25<38:30,  7.70s/it]                                                               {'loss': 2.3788, 'grad_norm': 0.9292984008789062, 'learning_rate': 4.387205008223854e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 747.54, 'epoch': 0.7}
- 70%|██████████████▋      | 700/1000 [4:22:25<38:30,  7.70s/it][2025-10-18 23:25:12,428] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 23:25:15,444] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4222972393035889
-[2025-10-18 23:25:16,894] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4485900402069092
-[2025-10-18 23:25:18,304] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.410125494003296
-[2025-10-18 23:25:19,697] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3927481174468994
-[2025-10-18 23:25:19,698] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.18it/s][A
-  2%|▍                         | 3/179 [00:00<00:41,  4.29it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.73it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.49it/s][A
-  4%|█                         | 7/179 [00:02<01:05,  2.63it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.77it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.76it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.85it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.70it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.80it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.88it/s][A
-  9%|██▎                      | 17/179 [00:06<01:06,  2.45it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▉                      | 21/179 [00:07<01:03,  2.48it/s][A
- 12%|███                      | 22/179 [00:08<00:58,  2.70it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|████▎                    | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.86it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.69it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.71it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.81it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.89it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.47it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.70it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.80it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.88it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.48it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:46,  2.81it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.88it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.46it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.79it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.69it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.87it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:45,  2.69it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.80it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.86it/s][A
- 34%|████████▌                | 61/179 [00:22<00:48,  2.46it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.87it/s][A
- 36%|█████████                | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.87it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.47it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.69it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.88it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.70it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.81it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.88it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.48it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.70it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.80it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.88it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.48it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:35,  2.70it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:37,  2.48it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.70it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.81it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.89it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.48it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:33,  2.69it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.79it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.86it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.47it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.69it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:30,  2.80it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.70it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.80it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.47it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.70it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.81it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.87it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.86it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.69it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.79it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.80it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.87it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.69it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.79it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.88it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.79it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.47it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.80it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.69it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.68it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.79it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.45it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:15,  2.68it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.78it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.85it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.46it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.68it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.79it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.87it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.47it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.69it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.38it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:11,  2.62it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.74it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.83it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.45it/s][A
- 86%|███████████████���████▋   | 154/179 [00:56<00:09,  2.66it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.77it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.86it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.69it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.80it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.46it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.78it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.86it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.45it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.67it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.79it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.71it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.87it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.42it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.65it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.74it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.83it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.43it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.68it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.50it/s][A                                                               
-                                                               [A{'eval_loss': 2.2877044677734375, 'eval_runtime': 68.6926, 'eval_samples_per_second': 2.853, 'eval_steps_per_second': 1.427, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.7}
- 70%|██████████████▋      | 700/1000 [4:23:41<38:30,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.50it/s][A
-                                                               [A[2025-10-18 23:26:28,398] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-700
- 70%|█████████████▎     | 701/1000 [4:23:51<2:35:37, 31.23s/it]                                                               {'loss': 2.3064, 'grad_norm': 0.8486847877502441, 'learning_rate': 4.360429701490934e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 839.36, 'epoch': 0.7}
- 70%|█████████████▎     | 701/1000 [4:23:51<2:35:37, 31.23s/it] 70%|█████████████▎     | 702/1000 [4:23:59<1:59:59, 24.16s/it]                                                               {'loss': 2.3641, 'grad_norm': 0.8147217631340027, 'learning_rate': 4.333713551181852e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 928.73, 'epoch': 0.7}
- 70%|█████████████▎     | 702/1000 [4:23:59<1:59:59, 24.16s/it] 70%|█████████████▎     | 703/1000 [4:24:07<1:35:07, 19.22s/it]                                                               {'loss': 1.8423, 'grad_norm': 1.0263283252716064, 'learning_rate': 4.307056837536373e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 653.8, 'epoch': 0.7}
- 70%|█████████████▎     | 703/1000 [4:24:07<1:35:07, 19.22s/it] 70%|█████████████▍     | 704/1000 [4:24:14<1:17:38, 15.74s/it]                                                               {'loss': 2.3799, 'grad_norm': 0.8192951679229736, 'learning_rate': 4.2804598401708175e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 996.15, 'epoch': 0.7}
- 70%|█████████████▍     | 704/1000 [4:24:14<1:17:38, 15.74s/it] 70%|█████████████▍     | 705/1000 [4:24:22<1:05:29, 13.32s/it]                                                               {'loss': 2.3174, 'grad_norm': 0.7945936918258667, 'learning_rate': 4.253922838075095e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 986.51, 'epoch': 0.7}
- 70%|█████████████▍     | 705/1000 [4:24:22<1:05:29, 13.32s/it] 71%|██████████████▊      | 706/1000 [4:24:30<57:01, 11.64s/it]                                                               {'loss': 2.5377, 'grad_norm': 0.7749230861663818, 'learning_rate': 4.227446109609809e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1047.09, 'epoch': 0.71}
- 71%|██████████████▊      | 706/1000 [4:24:30<57:01, 11.64s/it] 71%|██████████████▊      | 707/1000 [4:24:38<51:07, 10.47s/it]                                                               {'loss': 2.3519, 'grad_norm': 0.7944381833076477, 'learning_rate': 4.2010299325033034e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1058.97, 'epoch': 0.71}
- 71%|██████████████▊      | 707/1000 [4:24:38<51:07, 10.47s/it] 71%|██████████████▊      | 708/1000 [4:24:45<46:51,  9.63s/it]                                                               {'loss': 2.3223, 'grad_norm': 0.8416022658348083, 'learning_rate': 4.17467458384878e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 799.54, 'epoch': 0.71}
- 71%|██████████████▊      | 708/1000 [4:24:45<46:51,  9.63s/it] 71%|██████████████▉      | 709/1000 [4:24:53<43:54,  9.05s/it]                                                               {'loss': 2.3727, 'grad_norm': 0.8128032088279724, 'learning_rate': 4.1483803401013796e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 907.22, 'epoch': 0.71}
- 71%|██████████████▉      | 709/1000 [4:24:53<43:54,  9.05s/it] 71%|██████████████▉      | 710/1000 [4:25:01<41:47,  8.65s/it]                                                               {'loss': 2.4405, 'grad_norm': 0.848017692565918, 'learning_rate': 4.12214747707527e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 817.27, 'epoch': 0.71}
- 71%|██████████████▉      | 710/1000 [4:25:01<41:47,  8.65s/it] 71%|██████████████▉      | 711/1000 [4:25:08<40:18,  8.37s/it]                                                               {'loss': 2.2228, 'grad_norm': 0.7430258393287659, 'learning_rate': 4.0959762699407766e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1073.54, 'epoch': 0.71}
- 71%|██████████████▉      | 711/1000 [4:25:08<40:18,  8.37s/it] 71%|██████████████▉      | 712/1000 [4:25:16<39:12,  8.17s/it]                                                               {'loss': 2.325, 'grad_norm': 0.7409115433692932, 'learning_rate': 4.0698669932214727e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1096.73, 'epoch': 0.71}
- 71%|██████████████▉      | 712/1000 [4:25:16<39:12,  8.17s/it] 71%|██████████████▉      | 713/1000 [4:25:24<38:24,  8.03s/it]                                                               {'loss': 2.5166, 'grad_norm': 0.7323837876319885, 'learning_rate': 4.043819920791322e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1251.57, 'epoch': 0.71}
- 71%|██████████████▉      | 713/1000 [4:25:24<38:24,  8.03s/it] 71%|██████████████▉      | 714/1000 [4:25:31<37:48,  7.93s/it]                                                               {'loss': 2.3298, 'grad_norm': 0.863035261631012, 'learning_rate': 4.0178353258717804e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 864.28, 'epoch': 0.71}
- 71%|██████████████▉      | 714/1000 [4:25:31<37:48,  7.93s/it] 72%|███████████████      | 715/1000 [4:25:39<37:21,  7.86s/it]                                                               {'loss': 2.388, 'grad_norm': 0.7767102122306824, 'learning_rate': 3.991913481028965e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1092.36, 'epoch': 0.71}
- 72%|███████████████      | 715/1000 [4:25:39<37:21,  7.86s/it] 72%|███████████████      | 716/1000 [4:25:47<36:54,  7.80s/it]                                                               {'loss': 2.4033, 'grad_norm': 0.802910566329956, 'learning_rate': 3.966054658170754e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 959.45, 'epoch': 0.72}
- 72%|███████████████      | 716/1000 [4:25:47<36:54,  7.80s/it] 72%|███████████████      | 717/1000 [4:25:55<36:39,  7.77s/it]                                                               {'loss': 2.5494, 'grad_norm': 0.9419648051261902, 'learning_rate': 3.940259128543967e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 820.55, 'epoch': 0.72}
- 72%|███████████████      | 717/1000 [4:25:55<36:39,  7.77s/it] 72%|███████████████      | 718/1000 [4:26:02<36:26,  7.75s/it]                                                               {'loss': 2.2451, 'grad_norm': 0.7525441646575928, 'learning_rate': 3.9145271627314986e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1071.65, 'epoch': 0.72}
- 72%|███████████████      | 718/1000 [4:26:02<36:26,  7.75s/it] 72%|███████████████      | 719/1000 [4:26:10<36:15,  7.74s/it]                                                               {'loss': 2.2826, 'grad_norm': 0.8275830745697021, 'learning_rate': 3.8888590306494974e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 913.59, 'epoch': 0.72}
- 72%|███████████████      | 719/1000 [4:26:10<36:15,  7.74s/it] 72%|███████████████      | 720/1000 [4:26:18<36:02,  7.72s/it]                                                               {'loss': 2.6075, 'grad_norm': 1.0837161540985107, 'learning_rate': 3.8632550015445256e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 644.62, 'epoch': 0.72}
- 72%|███████████████      | 720/1000 [4:26:18<36:02,  7.72s/it] 72%|███████████████▏     | 721/1000 [4:26:25<35:53,  7.72s/it]                                                               {'loss': 2.4028, 'grad_norm': 0.7942420244216919, 'learning_rate': 3.8377153439907266e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1102.8, 'epoch': 0.72}
- 72%|███████████████▏     | 721/1000 [4:26:25<35:53,  7.72s/it] 72%|███████████████▏     | 722/1000 [4:26:33<35:44,  7.71s/it]                                                               {'loss': 2.4074, 'grad_norm': 0.8007482886314392, 'learning_rate': 3.81224032588703e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 999.97, 'epoch': 0.72}
- 72%|███████████████▏     | 722/1000 [4:26:33<35:44,  7.71s/it] 72%|███████████████▏     | 723/1000 [4:26:41<35:37,  7.72s/it]                                                               {'loss': 2.5139, 'grad_norm': 0.8451229929924011, 'learning_rate': 3.786830214454315e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 973.46, 'epoch': 0.72}
- 72%|███████████████▏     | 723/1000 [4:26:41<35:37,  7.72s/it] 72%|███████████████▏     | 724/1000 [4:26:48<35:27,  7.71s/it]                                                               {'loss': 2.5681, 'grad_norm': 0.9543566703796387, 'learning_rate': 3.7614852762326305e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1039.83, 'epoch': 0.72}
- 72%|███████████████▏     | 724/1000 [4:26:48<35:27,  7.71s/it] 72%|███████████████▏     | 725/1000 [4:26:56<35:18,  7.70s/it]                                                               {'loss': 2.2317, 'grad_norm': 0.8221555352210999, 'learning_rate': 3.736205777078381e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 834.79, 'epoch': 0.72}
- 72%|███████████████▏     | 725/1000 [4:26:56<35:18,  7.70s/it] 73%|███████████████▏     | 726/1000 [4:27:04<35:11,  7.71s/it]                                                               {'loss': 2.3999, 'grad_norm': 0.8767339587211609, 'learning_rate': 3.710991982161555e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 724.51, 'epoch': 0.73}
- 73%|███████████████▏     | 726/1000 [4:27:04<35:11,  7.71s/it] 73%|███████████████▎     | 727/1000 [4:27:12<35:01,  7.70s/it]                                                               {'loss': 2.0564, 'grad_norm': 0.7584757804870605, 'learning_rate': 3.6858441559629306e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1030.13, 'epoch': 0.73}
- 73%|███████████████▎     | 727/1000 [4:27:12<35:01,  7.70s/it] 73%|███████████████▎     | 728/1000 [4:27:19<34:53,  7.70s/it]                                                               {'loss': 2.3786, 'grad_norm': 0.7719346880912781, 'learning_rate': 3.6607625622713e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1081.86, 'epoch': 0.73}
- 73%|███████████████▎     | 728/1000 [4:27:19<34:53,  7.70s/it] 73%|███████████████▎     | 729/1000 [4:27:27<34:48,  7.71s/it]                                                               {'loss': 2.3073, 'grad_norm': 0.7996840476989746, 'learning_rate': 3.63574746418072e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 927.76, 'epoch': 0.73}
- 73%|███████████████▎     | 729/1000 [4:27:27<34:48,  7.71s/it] 73%|███████████████▎     | 730/1000 [4:27:35<34:41,  7.71s/it]                                                               {'loss': 2.3899, 'grad_norm': 0.773540198802948, 'learning_rate': 3.610799124087725e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 978.75, 'epoch': 0.73}
- 73%|███████████████▎     | 730/1000 [4:27:35<34:41,  7.71s/it] 73%|███████████████▎     | 731/1000 [4:27:42<34:36,  7.72s/it]                                                               {'loss': 2.6746, 'grad_norm': 0.6639180779457092, 'learning_rate': 3.585917803688603e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1333.5, 'epoch': 0.73}
- 73%|███████████████▎     | 731/1000 [4:27:42<34:36,  7.72s/it] 73%|███████████████▎     | 732/1000 [4:27:50<34:28,  7.72s/it]                                                               {'loss': 2.4636, 'grad_norm': 0.7918578386306763, 'learning_rate': 3.5611037639766265e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1032.03, 'epoch': 0.73}
- 73%|███████████████▎     | 732/1000 [4:27:50<34:28,  7.72s/it] 73%|███████████████▍     | 733/1000 [4:27:58<34:21,  7.72s/it]                                                               {'loss': 2.4049, 'grad_norm': 0.9314700365066528, 'learning_rate': 3.5363572652393326e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 712.63, 'epoch': 0.73}
- 73%|███████████████▍     | 733/1000 [4:27:58<34:21,  7.72s/it] 73%|███████████████▍     | 734/1000 [4:28:06<34:10,  7.71s/it]                                                               {'loss': 2.281, 'grad_norm': 0.8572481274604797, 'learning_rate': 3.511678567055786e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 856.79, 'epoch': 0.73}
- 73%|███████████████▍     | 734/1000 [4:28:06<34:10,  7.71s/it] 74%|███████████████▍     | 735/1000 [4:28:13<34:01,  7.71s/it]                                                               {'loss': 2.6364, 'grad_norm': 0.9266046285629272, 'learning_rate': 3.487067928293848e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 892.2, 'epoch': 0.73}
- 74%|███████████████▍     | 735/1000 [4:28:13<34:01,  7.71s/it] 74%|███████████████▍     | 736/1000 [4:28:21<33:53,  7.70s/it]                                                               {'loss': 2.3035, 'grad_norm': 0.7125644683837891, 'learning_rate': 3.4625256071074773e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1049.08, 'epoch': 0.74}
- 74%|███████████████▍     | 736/1000 [4:28:21<33:53,  7.70s/it] 74%|███████████████▍     | 737/1000 [4:28:29<33:46,  7.70s/it]                                                               {'loss': 2.4367, 'grad_norm': 0.77367103099823, 'learning_rate': 3.4380518609340076e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1115.23, 'epoch': 0.74}
- 74%|███████████████▍     | 737/1000 [4:28:29<33:46,  7.70s/it] 74%|███████████████▍     | 738/1000 [4:28:36<33:37,  7.70s/it]                                                               {'loss': 2.4499, 'grad_norm': 0.709686815738678, 'learning_rate': 3.4136469464914575e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1304.05, 'epoch': 0.74}
- 74%|███████████████▍     | 738/1000 [4:28:36<33:37,  7.70s/it] 74%|███████████████▌     | 739/1000 [4:28:44<33:29,  7.70s/it]                                                               {'loss': 2.3582, 'grad_norm': 1.0302982330322266, 'learning_rate': 3.389311119775828e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 610.63, 'epoch': 0.74}
- 74%|███████████████▌     | 739/1000 [4:28:44<33:29,  7.70s/it] 74%|███████████████▌     | 740/1000 [4:28:52<33:23,  7.71s/it]                                                               {'loss': 2.412, 'grad_norm': 0.9974873065948486, 'learning_rate': 3.3650446360584275e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 636.55, 'epoch': 0.74}
- 74%|███████████████▌     | 740/1000 [4:28:52<33:23,  7.71s/it] 74%|███████████████▌     | 741/1000 [4:28:59<33:14,  7.70s/it]                                                               {'loss': 2.5027, 'grad_norm': 0.9797143340110779, 'learning_rate': 3.340847749883191e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 691.81, 'epoch': 0.74}
- 74%|███████████████▌     | 741/1000 [4:28:59<33:14,  7.70s/it] 74%|███████████████▌     | 742/1000 [4:29:07<33:06,  7.70s/it]                                                               {'loss': 2.5185, 'grad_norm': 0.831883430480957, 'learning_rate': 3.316720715064e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 942.79, 'epoch': 0.74}
- 74%|███████████████▌     | 742/1000 [4:29:07<33:06,  7.70s/it] 74%|███████████████▌     | 743/1000 [4:29:15<32:58,  7.70s/it]                                                               {'loss': 2.3623, 'grad_norm': 0.7420775890350342, 'learning_rate': 3.292663784682036e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 956.57, 'epoch': 0.74}
- 74%|███████████████▌     | 743/1000 [4:29:15<32:58,  7.70s/it] 74%|███��███████████▌     | 744/1000 [4:29:23<32:52,  7.70s/it]                                                               {'loss': 2.4405, 'grad_norm': 0.7084334492683411, 'learning_rate': 3.268677211083109e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1120.62, 'epoch': 0.74}
- 74%|███████████████▌     | 744/1000 [4:29:23<32:52,  7.70s/it] 74%|███████████████▋     | 745/1000 [4:29:30<32:42,  7.70s/it]                                                               {'loss': 2.2235, 'grad_norm': 0.8280802965164185, 'learning_rate': 3.2447612458750365e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 832.43, 'epoch': 0.74}
- 74%|███████████████▋     | 745/1000 [4:29:30<32:42,  7.70s/it] 75%|███████████████▋     | 746/1000 [4:29:38<32:37,  7.70s/it]                                                               {'loss': 2.4574, 'grad_norm': 0.7158889770507812, 'learning_rate': 3.2209161399249674e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1203.29, 'epoch': 0.75}
- 75%|███████████████▋     | 746/1000 [4:29:38<32:37,  7.70s/it] 75%|███████████████▋     | 747/1000 [4:29:46<32:27,  7.70s/it]                                                               {'loss': 2.3446, 'grad_norm': 0.8821169137954712, 'learning_rate': 3.197142143356787e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 777.12, 'epoch': 0.75}
- 75%|███████████████▋     | 747/1000 [4:29:46<32:27,  7.70s/it] 75%|███████████████▋     | 748/1000 [4:29:53<32:21,  7.70s/it]                                                               {'loss': 2.5396, 'grad_norm': 0.8297573924064636, 'learning_rate': 3.173439505548462e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 887.14, 'epoch': 0.75}
- 75%|███████████████▋     | 748/1000 [4:29:53<32:21,  7.70s/it] 75%|███████████████▋     | 749/1000 [4:30:01<32:11,  7.70s/it]                                                               {'loss': 2.3952, 'grad_norm': 0.8457604646682739, 'learning_rate': 3.149808475129452e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 886.71, 'epoch': 0.75}
- 75%|███████████████▋     | 749/1000 [4:30:01<32:11,  7.70s/it] 75%|███████████████▊     | 750/1000 [4:30:09<32:05,  7.70s/it]                                                               {'loss': 2.2126, 'grad_norm': 0.7009559273719788, 'learning_rate': 3.126249299978086e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1049.15, 'epoch': 0.75}
- 75%|███████████████▊     | 750/1000 [4:30:09<32:05,  7.70s/it][2025-10-18 23:32:55,843] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 23:32:58,856] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.422849178314209
-[2025-10-18 23:33:00,090] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.2339284420013428
-[2025-10-18 23:33:01,306] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.2156398296356201
-[2025-10-18 23:33:02,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1947009563446045
-[2025-10-18 23:33:02,502] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.22it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.33it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.80it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.23it/s][A
-  3%|▊                         | 6/179 [00:02<01:08,  2.51it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.77it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.41it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.65it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.76it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.85it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.67it/s][A
-  8%|██                       | 15/179 [00:05<00:59,  2.78it/s][A
-  9%|██▏                      | 16/179 [00:05<00:57,  2.85it/s][A
-  9%|██▎                      | 17/179 [00:06<01:06,  2.45it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.68it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.78it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.87it/s][A
- 12%|██▉                      | 21/179 [00:07<01:04,  2.44it/s][A
- 12%|███                      | 22/179 [00:08<00:58,  2.66it/s][A
- 13%|███▏                     | 23/179 [00:08<00:56,  2.78it/s][A
- 13%|███▎                     | 24/179 [00:08<00:54,  2.86it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.46it/s][A
- 15%|███▋                     | 26/179 [00:09<00:57,  2.68it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.78it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.86it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|████▎                    | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.86it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.44it/s][A
- 19%|████▋                    | 34/179 [00:12<00:54,  2.67it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.77it/s][A
- 20%|█████                    | 36/179 [00:13<00:50,  2.85it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.47it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.68it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:50,  2.79it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.87it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.47it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:51,  2.68it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.79it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:47,  2.86it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.79it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.85it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:53,  2.45it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.79it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.86it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.46it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.88it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.71it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.81it/s][A
- 34%|████████▍                | 60/179 [00:22<00:41,  2.88it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.86it/s][A
- 36%|█████████                | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:42,  2.68it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.78it/s][A
- 38%|█████████▍               | 68/179 [00:25<00:38,  2.85it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.46it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.67it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.78it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.85it/s][A
- 41%|██████████▏              | 73/179 [00:27<00:43,  2.46it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:39,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|██████████▌              | 76/179 [00:28<00:35,  2.87it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.46it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.68it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.78it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.86it/s][A
- 45%|███████████▎             | 81/179 [00:30<00:39,  2.46it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:36,  2.68it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.46it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.68it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:33,  2.78it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.85it/s][A
- 50%|████████████▍            | 89/179 [00:33<00:40,  2.23it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:35,  2.49it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:33,  2.63it/s][A
- 51%|████████████▊            | 92/179 [00:34<00:31,  2.74it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:35,  2.40it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:32,  2.63it/s][A
- 53%|█████████████▎           | 95/179 [00:35<00:30,  2.74it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:29,  2.83it/s][A
- 54%|█████████████▌           | 97/179 [00:36<00:33,  2.45it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.68it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.78it/s][A
- 56%|█████████████▍          | 100/179 [00:37<00:27,  2.86it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.69it/s][A
- 58%|█████████████▊          | 103/179 [00:38<00:27,  2.79it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|██████████████          | 105/179 [00:39<00:30,  2.47it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.70it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.79it/s][A
- 60%|██████████████▍         | 108/179 [00:40<00:24,  2.85it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.47it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.70it/s][A
- 62%|██████████████▉         | 111/179 [00:41<00:24,  2.79it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:42<00:26,  2.47it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.81it/s][A
- 65%|███████████████▌        | 116/179 [00:43<00:21,  2.87it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.69it/s][A
- 66%|███████████████▉        | 119/179 [00:44<00:21,  2.80it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.86it/s][A
- 68%|████████████████▏       | 121/179 [00:45<00:23,  2.46it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.70it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:19,  2.80it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.86it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.46it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.68it/s][A
- 71%|███████���█████████       | 127/179 [00:47<00:18,  2.79it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:48<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.68it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.68it/s][A
- 75%|██████████████████      | 135/179 [00:50<00:15,  2.78it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.45it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.67it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.77it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.85it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.45it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.67it/s][A
- 80%|███████████████████▏    | 143/179 [00:53<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.84it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.45it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.66it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.77it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.85it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.46it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.68it/s][A
- 84%|████████████████████▏   | 151/179 [00:56<00:10,  2.78it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.87it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|████████████████████▋   | 154/179 [00:57<00:09,  2.70it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.87it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.70it/s][A
- 89%|█████████████████████▎  | 159/179 [00:59<00:07,  2.78it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.86it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.45it/s][A
- 91%|█████████████████████▋  | 162/179 [01:00<00:06,  2.67it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.77it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.85it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.44it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.68it/s][A
- 93%|██████████████████████▍ | 167/179 [01:02<00:04,  2.77it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.86it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.46it/s][A
- 95%|██████████████████████▊ | 170/179 [01:03<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.87it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.47it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|███████████████████████▍| 175/179 [01:05<00:01,  2.79it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.47it/s][A
- 99%|███████████████████████▊| 178/179 [01:06<00:00,  2.69it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A                                                               
-                                                               [A{'eval_loss': 2.274824857711792, 'eval_runtime': 68.704, 'eval_samples_per_second': 2.853, 'eval_steps_per_second': 1.426, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.75}
- 75%|███████████████▊     | 750/1000 [4:31:24<32:05,  7.70s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A
-                                                               [A[2025-10-18 23:34:11,213] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-750
- 75%|██████████████▎    | 751/1000 [4:31:34<2:08:45, 31.03s/it]                                                               {'loss': 2.3984, 'grad_norm': 0.6870959997177124, 'learning_rate': 3.102762227218957e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 1240.91, 'epoch': 0.75}
- 75%|██████████████▎    | 751/1000 [4:31:34<2:08:45, 31.03s/it] 75%|██████████████▎    | 752/1000 [4:31:42<1:39:14, 24.01s/it]                                                               {'loss': 2.4711, 'grad_norm': 0.8758698105812073, 'learning_rate': 3.079347503220351e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 856.44, 'epoch': 0.75}
- 75%|██████████████▎    | 752/1000 [4:31:42<1:39:14, 24.01s/it] 75%|██████████████▎    | 753/1000 [4:31:49<1:18:34, 19.09s/it]                                                               {'loss': 2.4077, 'grad_norm': 0.8419890403747559, 'learning_rate': 3.056005373591637e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 849.58, 'epoch': 0.75}
- 75%|██████████████▎    | 753/1000 [4:31:49<1:18:34, 19.09s/it] 75%|██████████████▎    | 754/1000 [4:31:57<1:04:14, 15.67s/it]                                                               {'loss': 2.379, 'grad_norm': 0.7834282517433167, 'learning_rate': 3.032736083180716e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 975.04, 'epoch': 0.75}
- 75%|██████████████▎    | 754/1000 [4:31:57<1:04:14, 15.67s/it] 76%|███████████████▊     | 755/1000 [4:32:05<54:13, 13.28s/it]                                                               {'loss': 2.3742, 'grad_norm': 0.8919993042945862, 'learning_rate': 3.0095398760714267e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 833.53, 'epoch': 0.76}
- 76%|███████████████▊     | 755/1000 [4:32:05<54:13, 13.28s/it] 76%|███████████████▉     | 756/1000 [4:32:12<47:09, 11.60s/it]                                                               {'loss': 2.3042, 'grad_norm': 0.8099663853645325, 'learning_rate': 2.9864169955810084e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 886.31, 'epoch': 0.76}
- 76%|███████████████▉     | 756/1000 [4:32:12<47:09, 11.60s/it] 76%|███████████████▉     | 757/1000 [4:32:20<42:15, 10.44s/it]                                                               {'loss': 2.3162, 'grad_norm': 0.7349341511726379, 'learning_rate': 2.9633676842575387e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1106.79, 'epoch': 0.76}
- 76%|███████████████▉     | 757/1000 [4:32:20<42:15, 10.44s/it] 76%|█��█████████████▉     | 758/1000 [4:32:28<38:45,  9.61s/it]                                                               {'loss': 2.2528, 'grad_norm': 0.8883168697357178, 'learning_rate': 2.940392183877382e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 812.93, 'epoch': 0.76}
- 76%|███████████████▉     | 758/1000 [4:32:28<38:45,  9.61s/it] 76%|███████████████▉     | 759/1000 [4:32:36<36:18,  9.04s/it]                                                               {'loss': 2.5123, 'grad_norm': 0.7579997181892395, 'learning_rate': 2.9174907354426696e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1220.41, 'epoch': 0.76}
- 76%|███████████████▉     | 759/1000 [4:32:36<36:18,  9.04s/it] 76%|███████████████▉     | 760/1000 [4:32:43<34:33,  8.64s/it]                                                               {'loss': 2.3992, 'grad_norm': 1.0064071416854858, 'learning_rate': 2.8946635791787545e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 758.66, 'epoch': 0.76}
- 76%|███████████████▉     | 760/1000 [4:32:43<34:33,  8.64s/it] 76%|███████████████▉     | 761/1000 [4:32:51<33:17,  8.36s/it]                                                               {'loss': 2.6983, 'grad_norm': 0.977753221988678, 'learning_rate': 2.8719109545317103e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 944.36, 'epoch': 0.76}
- 76%|███████████████▉     | 761/1000 [4:32:51<33:17,  8.36s/it] 76%|████████████████     | 762/1000 [4:32:59<32:22,  8.16s/it]                                                               {'loss': 2.4066, 'grad_norm': 0.8830663561820984, 'learning_rate': 2.8492331001657945e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 787.33, 'epoch': 0.76}
- 76%|████████████████     | 762/1000 [4:32:59<32:22,  8.16s/it] 76%|████████████████     | 763/1000 [4:33:06<31:42,  8.03s/it]                                                               {'loss': 2.3168, 'grad_norm': 0.7640048861503601, 'learning_rate': 2.8266302539609745e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1036.02, 'epoch': 0.76}
- 76%|████████████████     | 763/1000 [4:33:06<31:42,  8.03s/it] 76%|████████████████     | 764/1000 [4:33:14<31:08,  7.92s/it]                                                               {'loss': 2.1416, 'grad_norm': 0.8457198143005371, 'learning_rate': 2.804102653010414e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 898.74, 'epoch': 0.76}
- 76%|████████████████     | 764/1000 [4:33:14<31:08,  7.92s/it] 76%|████████████████     | 765/1000 [4:33:22<30:41,  7.83s/it]                                                               {'loss': 2.4252, 'grad_norm': 0.9579863548278809, 'learning_rate': 2.7816505336179798e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 674.11, 'epoch': 0.77}
- 76%|████████████████     | 765/1000 [4:33:22<30:41,  7.83s/it] 77%|████████████████     | 766/1000 [4:33:29<30:23,  7.79s/it]                                                               {'loss': 2.4124, 'grad_norm': 1.0743871927261353, 'learning_rate': 2.759274131295787e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 612.82, 'epoch': 0.77}
- 77%|████████████████     | 766/1000 [4:33:29<30:23,  7.79s/it] 77%|████████████████     | 767/1000 [4:33:37<30:10,  7.77s/it]                                                               {'loss': 2.394, 'grad_norm': 0.8692949414253235, 'learning_rate': 2.736973680761702e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 874.64, 'epoch': 0.77}
- 77%|████████████████     | 767/1000 [4:33:37<30:10,  7.77s/it] 77%|██████���█████████▏    | 768/1000 [4:33:45<29:56,  7.74s/it]                                                               {'loss': 2.3259, 'grad_norm': 0.9385331869125366, 'learning_rate': 2.7147494159369036e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 667.38, 'epoch': 0.77}
- 77%|████████████████▏    | 768/1000 [4:33:45<29:56,  7.74s/it] 77%|████████████████▏    | 769/1000 [4:33:53<29:47,  7.74s/it]                                                               {'loss': 2.2832, 'grad_norm': 0.8399586081504822, 'learning_rate': 2.6926015699434072e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 922.5, 'epoch': 0.77}
- 77%|████████████████▏    | 769/1000 [4:33:53<29:47,  7.74s/it] 77%|████████████████▏    | 770/1000 [4:34:00<29:40,  7.74s/it]                                                               {'loss': 2.4808, 'grad_norm': 1.013923168182373, 'learning_rate': 2.6705303751016408e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1277.92, 'epoch': 0.77}
- 77%|████████████████▏    | 770/1000 [4:34:00<29:40,  7.74s/it] 77%|████████████████▏    | 771/1000 [4:34:08<29:30,  7.73s/it]                                                               {'loss': 2.2353, 'grad_norm': 0.8694955110549927, 'learning_rate': 2.6485360629279987e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 782.66, 'epoch': 0.77}
- 77%|████████████████▏    | 771/1000 [4:34:08<29:30,  7.73s/it] 77%|████████████████▏    | 772/1000 [4:34:16<29:21,  7.73s/it]                                                               {'loss': 2.4885, 'grad_norm': 0.8632628321647644, 'learning_rate': 2.6266188641323996e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 909.19, 'epoch': 0.77}
- 77%|████████████████▏    | 772/1000 [4:34:16<29:21,  7.73s/it] 77%|████████████████▏    | 773/1000 [4:34:23<29:11,  7.72s/it]                                                               {'loss': 2.4935, 'grad_norm': 0.8489196300506592, 'learning_rate': 2.6047790086158952e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 852.48, 'epoch': 0.77}
- 77%|████████████████▏    | 773/1000 [4:34:23<29:11,  7.72s/it] 77%|████████████████▎    | 774/1000 [4:34:31<29:04,  7.72s/it]                                                               {'loss': 2.2013, 'grad_norm': 0.7899218201637268, 'learning_rate': 2.5830167254682257e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 962.41, 'epoch': 0.77}
- 77%|████████████████▎    | 774/1000 [4:34:31<29:04,  7.72s/it] 78%|████████████████▎    | 775/1000 [4:34:39<28:55,  7.71s/it]                                                               {'loss': 2.2906, 'grad_norm': 0.9783218502998352, 'learning_rate': 2.5613322429654574e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 758.63, 'epoch': 0.78}
- 78%|████████████████▎    | 775/1000 [4:34:39<28:55,  7.71s/it] 78%|████████████████▎    | 776/1000 [4:34:46<28:42,  7.69s/it]                                                               {'loss': 2.4001, 'grad_norm': 0.8975478410720825, 'learning_rate': 2.5397257885675397e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 843.88, 'epoch': 0.78}
- 78%|████████████████▎    | 776/1000 [4:34:46<28:42,  7.69s/it] 78%|████████████████▎    | 777/1000 [4:34:54<28:33,  7.68s/it]                                                               {'loss': 2.2739, 'grad_norm': 1.0550146102905273, 'learning_rate': 2.5181975889159615e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 623.08, 'epoch': 0.78}
- 78%|████████████████▎    | 777/1000 [4:34:54<28:33,  7.68s/it] 78%|████████████████▎    | 778/1000 [4:35:02<28:25,  7.68s/it]                                                               {'loss': 2.5905, 'grad_norm': 1.1428221464157104, 'learning_rate': 2.496747869831345e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 546.02, 'epoch': 0.78}
- 78%|████████████████▎    | 778/1000 [4:35:02<28:25,  7.68s/it] 78%|████████████████▎    | 779/1000 [4:35:10<28:19,  7.69s/it]                                                               {'loss': 2.4074, 'grad_norm': 0.7170055508613586, 'learning_rate': 2.475376856311097e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1285.55, 'epoch': 0.78}
- 78%|████████████████▎    | 779/1000 [4:35:10<28:19,  7.69s/it] 78%|████████████████▍    | 780/1000 [4:35:17<28:11,  7.69s/it]                                                               {'loss': 2.4448, 'grad_norm': 0.8317585587501526, 'learning_rate': 2.4540847725270378e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 950.5, 'epoch': 0.78}
- 78%|████████████████▍    | 780/1000 [4:35:17<28:11,  7.69s/it] 78%|████████████████▍    | 781/1000 [4:35:25<28:04,  7.69s/it]                                                               {'loss': 2.4692, 'grad_norm': 0.9008452892303467, 'learning_rate': 2.432871841823047e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 853.4, 'epoch': 0.78}
- 78%|████████████████▍    | 781/1000 [4:35:25<28:04,  7.69s/it] 78%|████████████████▍    | 782/1000 [4:35:33<27:57,  7.70s/it]                                                               {'loss': 2.4385, 'grad_norm': 0.8514346480369568, 'learning_rate': 2.411738286712735e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1007.55, 'epoch': 0.78}
- 78%|████████████████▍    | 782/1000 [4:35:33<27:57,  7.70s/it] 78%|████████████████▍    | 783/1000 [4:35:40<27:48,  7.69s/it]                                                               {'loss': 2.3682, 'grad_norm': 0.91911381483078, 'learning_rate': 2.3906843288770886e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 720.16, 'epoch': 0.78}
- 78%|████████████████▍    | 783/1000 [4:35:40<27:48,  7.69s/it] 78%|████████████████▍    | 784/1000 [4:35:48<27:42,  7.69s/it]                                                               {'loss': 2.2046, 'grad_norm': 0.8594357967376709, 'learning_rate': 2.3697101891621697e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 807.67, 'epoch': 0.78}
- 78%|████████████████▍    | 784/1000 [4:35:48<27:42,  7.69s/it] 78%|████████████████▍    | 785/1000 [4:35:56<27:33,  7.69s/it]                                                               {'loss': 2.3264, 'grad_norm': 1.0453511476516724, 'learning_rate': 2.3488160875767717e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 544.33, 'epoch': 0.79}
- 78%|████████████████▍    | 785/1000 [4:35:56<27:33,  7.69s/it] 79%|████████████████▌    | 786/1000 [4:36:03<27:27,  7.70s/it]                                                               {'loss': 2.2037, 'grad_norm': 0.7583779096603394, 'learning_rate': 2.3280022432901383e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1053.61, 'epoch': 0.79}
- 79%|████████████████▌    | 786/1000 [4:36:03<27:27,  7.70s/it] 79%|████████████████▌    | 787/1000 [4:36:11<27:19,  7.70s/it]                                                               {'loss': 2.2276, 'grad_norm': 0.8789792060852051, 'learning_rate': 2.307268874629649e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 797.34, 'epoch': 0.79}
- 79%|████████████████▌    | 787/1000 [4:36:11<27:19,  7.70s/it] 79%|████████████████▌    | 788/1000 [4:36:19<27:07,  7.68s/it]                                                               {'loss': 2.4048, 'grad_norm': 0.9376871585845947, 'learning_rate': 2.2866161990785228e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 805.55, 'epoch': 0.79}
- 79%|████████████████▌    | 788/1000 [4:36:19<27:07,  7.68s/it] 79%|████████████████▌    | 789/1000 [4:36:26<27:01,  7.69s/it]                                                               {'loss': 2.2564, 'grad_norm': 0.7980170249938965, 'learning_rate': 2.266044433273562e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 950.81, 'epoch': 0.79}
- 79%|████████████████▌    | 789/1000 [4:36:26<27:01,  7.69s/it] 79%|████████████████▌    | 790/1000 [4:36:34<26:56,  7.70s/it]                                                               {'loss': 2.5879, 'grad_norm': 0.7848008275032043, 'learning_rate': 2.245553793002849e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1149.1, 'epoch': 0.79}
- 79%|████████████████▌    | 790/1000 [4:36:34<26:56,  7.70s/it] 79%|████████████████▌    | 791/1000 [4:36:42<26:47,  7.69s/it]                                                               {'loss': 2.5216, 'grad_norm': 0.8388023376464844, 'learning_rate': 2.2251444932035094e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1025.0, 'epoch': 0.79}
- 79%|████████████████▌    | 791/1000 [4:36:42<26:47,  7.69s/it] 79%|████████████████▋    | 792/1000 [4:36:49<26:38,  7.68s/it]                                                               {'loss': 2.2971, 'grad_norm': 0.9787597060203552, 'learning_rate': 2.204816747959434e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 577.75, 'epoch': 0.79}
- 79%|████████████████▋    | 792/1000 [4:36:49<26:38,  7.68s/it] 79%|████████████████▋    | 793/1000 [4:36:57<26:28,  7.67s/it]                                                               {'loss': 2.4412, 'grad_norm': 0.820093035697937, 'learning_rate': 2.184570770499056e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1088.28, 'epoch': 0.79}
- 79%|████████████████▋    | 793/1000 [4:36:57<26:28,  7.67s/it] 79%|████████████████▋    | 794/1000 [4:37:05<26:20,  7.67s/it]                                                               {'loss': 2.4178, 'grad_norm': 0.8776549696922302, 'learning_rate': 2.1644067731931007e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 996.77, 'epoch': 0.79}
- 79%|████████████████▋    | 794/1000 [4:37:05<26:20,  7.67s/it] 80%|████████████████▋    | 795/1000 [4:37:12<26:11,  7.66s/it]                                                               {'loss': 2.5738, 'grad_norm': 0.7985237836837769, 'learning_rate': 2.1443249675523536e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1148.12, 'epoch': 0.8}
- 80%|████████████████▋    | 795/1000 [4:37:12<26:11,  7.66s/it] 80%|████████████████▋    | 796/1000 [4:37:20<26:00,  7.65s/it]                                                               {'loss': 2.2309, 'grad_norm': 0.7557388544082642, 'learning_rate': 2.1243255642254578e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1071.5, 'epoch': 0.8}
- 80%|████████████████▋    | 796/1000 [4:37:20<26:00,  7.65s/it] 80%|████████████████▋    | 797/1000 [4:37:28<25:50,  7.64s/it]                                                               {'loss': 2.2706, 'grad_norm': 0.8294113278388977, 'learning_rate': 2.1044087729966856e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 924.77, 'epoch': 0.8}
- 80%|█████████████���██▋    | 797/1000 [4:37:28<25:50,  7.64s/it] 80%|████████████████▊    | 798/1000 [4:37:35<25:40,  7.63s/it]                                                               {'loss': 2.2625, 'grad_norm': 0.7490404844284058, 'learning_rate': 2.0845748027837586e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1025.67, 'epoch': 0.8}
- 80%|████████████████▊    | 798/1000 [4:37:35<25:40,  7.63s/it] 80%|████████████████▊    | 799/1000 [4:37:43<25:33,  7.63s/it]                                                               {'loss': 2.4494, 'grad_norm': 0.8937728404998779, 'learning_rate': 2.0648238616356332e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 825.86, 'epoch': 0.8}
- 80%|████████████████▊    | 799/1000 [4:37:43<25:33,  7.63s/it] 80%|████████████████▊    | 800/1000 [4:37:51<25:27,  7.64s/it]                                                               {'loss': 2.4695, 'grad_norm': 0.8685123324394226, 'learning_rate': 2.045156156730338e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1031.31, 'epoch': 0.8}
- 80%|████████████████▊    | 800/1000 [4:37:51<25:27,  7.64s/it][2025-10-18 23:40:37,689] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 23:40:40,193] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1963882446289062
-[2025-10-18 23:40:41,366] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1726484298706055
-[2025-10-18 23:40:42,559] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1933553218841553
-[2025-10-18 23:40:43,751] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1911194324493408
-[2025-10-18 23:40:43,751] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:27,  6.33it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.34it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.79it/s][A
-  3%|▋                         | 5/179 [00:01<01:19,  2.19it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.48it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.79it/s][A
-  5%|█▎                        | 9/179 [00:03<01:09,  2.43it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.68it/s][A
-  6%|█▌                       | 11/179 [00:03<00:59,  2.81it/s][A
-  7%|█▋                       | 12/179 [00:04<00:57,  2.90it/s][A
-  7%|█▊                       | 13/179 [00:04<01:06,  2.50it/s][A
-  8%|█▉                       | 14/179 [00:04<01:00,  2.73it/s][A
-  8%|██                       | 15/179 [00:05<00:57,  2.83it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.90it/s][A
-  9%|██▎                      | 17/179 [00:06<01:04,  2.50it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.72it/s][A
- 11%|██▋                      | 19/179 [00:06<00:56,  2.84it/s][A
- 11%|██▊                      | 20/179 [00:07<00:54,  2.92it/s][A
- 12%|██▉                      | 21/179 [00:07<01:02,  2.52it/s][A
- 12%|███                      | 22/179 [00:07<00:57,  2.73it/s][A
- 13%|███▏                     | 23/179 [00:08<00:54,  2.84it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.92it/s][A
- 14%|███▍                     | 25/179 [00:09<01:01,  2.51it/s][A
- 15%|███▋                     | 26/179 [00:09<00:55,  2.74it/s][A
- 15%|███▊                     | 27/179 [00:09<00:53,  2.84it/s][A
- 16%|███▉                     | 28/179 [00:10<00:51,  2.92it/s][A
- 16%|████                     | 29/179 [00:10<00:59,  2.51it/s][A
- 17%|████▏                    | 30/179 [00:10<00:54,  2.74it/s][A
- 17%|████▎                    | 31/179 [00:11<00:51,  2.85it/s][A
- 18%|████▍                    | 32/179 [00:11<00:50,  2.93it/s][A
- 18%|████▌                    | 33/179 [00:11<00:58,  2.51it/s][A
- 19%|████▋                    | 34/179 [00:12<00:52,  2.74it/s][A
- 20%|████▉                    | 35/179 [00:12<00:50,  2.85it/s][A
- 20%|█████                    | 36/179 [00:12<00:49,  2.92it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:56,  2.52it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:51,  2.75it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.85it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:47,  2.93it/s][A
- 23%|█████▋                   | 41/179 [00:14<00:54,  2.52it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:49,  2.75it/s][A
- 24%|██████                   | 43/179 [00:15<00:47,  2.85it/s][A
- 25%|██████▏                  | 44/179 [00:15<00:46,  2.93it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:53,  2.52it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:48,  2.75it/s][A
- 26%|██████▌                  | 47/179 [00:16<00:46,  2.85it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:44,  2.92it/s][A
- 27%|██████▊                  | 49/179 [00:17<00:51,  2.52it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.74it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.84it/s][A
- 29%|███████▎                 | 52/179 [00:18<00:43,  2.92it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:50,  2.51it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:45,  2.73it/s][A
- 31%|███████▋                 | 55/179 [00:19<00:43,  2.84it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.92it/s][A
- 32%|███████▉                 | 57/179 [00:20<00:48,  2.51it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.74it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.84it/s][A
- 34%|████████▍                | 60/179 [00:21<00:40,  2.92it/s][A
- 34%|████████▌                | 61/179 [00:22<00:46,  2.51it/s][A
- 35%|████████▋                | 62/179 [00:22<00:42,  2.73it/s][A
- 35%|████████▊                | 63/179 [00:22<00:40,  2.84it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.92it/s][A
- 36%|█████████                | 65/179 [00:23<00:45,  2.51it/s][A
- 37%|█████████▏               | 66/179 [00:23<00:41,  2.74it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:39,  2.85it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:37,  2.92it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:43,  2.51it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:39,  2.74it/s][A
- 40%|█████████▉               | 71/179 [00:25<00:37,  2.85it/s][A
- 40%|██████████               | 72/179 [00:26<00:36,  2.92it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.51it/s][A
- 41%|██████████▎              | 74/179 [00:26<00:38,  2.74it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:36,  2.85it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.93it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:40,  2.52it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:36,  2.73it/s][A
- 44%|███████████              | 79/179 [00:28<00:35,  2.84it/s][A
- 45%|███████████▏             | 80/179 [00:28<00:33,  2.91it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:38,  2.52it/s][A
- 46%|███████████▍             | 82/179 [00:29<00:35,  2.74it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:33,  2.84it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:32,  2.91it/s][A
- 47%|███████████▊             | 85/179 [00:30<00:37,  2.51it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.73it/s][A
- 49%|████████████▏            | 87/179 [00:31<00:32,  2.84it/s][A
- 49%|████████████▎            | 88/179 [00:31<00:31,  2.92it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:35,  2.52it/s][A
- 50%|████████████▌            | 90/179 [00:32<00:32,  2.74it/s][A
- 51%|████████████▋            | 91/179 [00:32<00:30,  2.84it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:29,  2.92it/s][A
- 52%|████████████▉            | 93/179 [00:33<00:34,  2.51it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.74it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:29,  2.85it/s][A
- 54%|█████████████▍           | 96/179 [00:34<00:28,  2.93it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:32,  2.52it/s][A
- 55%|█████████████▋           | 98/179 [00:35<00:29,  2.74it/s][A
- 55%|█████████████▊           | 99/179 [00:35<00:28,  2.84it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.91it/s][A
- 56%|█████████████▌          | 101/179 [00:36<00:31,  2.51it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.73it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:26,  2.84it/s][A
- 58%|█████████████▉          | 104/179 [00:37<00:25,  2.91it/s][A
- 59%|██████████████          | 105/179 [00:38<00:29,  2.50it/s][A
- 59%|██████████████▏         | 106/179 [00:38<00:26,  2.73it/s][A
- 60%|██████████████▎         | 107/179 [00:38<00:25,  2.83it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.91it/s][A
- 61%|██████████████▌         | 109/179 [00:39<00:27,  2.51it/s][A
- 61%|██████████████▋         | 110/179 [00:39<00:25,  2.74it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:23,  2.85it/s][A
- 63%|███████████████         | 112/179 [00:40<00:22,  2.92it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.51it/s][A
- 64%|███████████████▎        | 114/179 [00:41<00:23,  2.73it/s][A
- 64%|███████████████▍        | 115/179 [00:41<00:22,  2.85it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.92it/s][A
- 65%|███████████████▋        | 117/179 [00:42<00:24,  2.51it/s][A
- 66%|███████████████▊        | 118/179 [00:42<00:22,  2.73it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.84it/s][A
- 67%|████████████████        | 120/179 [00:43<00:20,  2.92it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.51it/s][A
- 68%|████████████████▎       | 122/179 [00:44<00:20,  2.74it/s][A
- 69%|████████████████▍       | 123/179 [00:44<00:19,  2.85it/s][A
- 69%|████████████████▋       | 124/179 [00:44<00:18,  2.92it/s][A
- 70%|████████████████▊       | 125/179 [00:45<00:21,  2.52it/s][A
- 70%|████████████████▉       | 126/179 [00:45<00:19,  2.74it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.85it/s][A
- 72%|█████████████████▏      | 128/179 [00:46<00:17,  2.91it/s][A
- 72%|█████████████████▎      | 129/179 [00:46<00:19,  2.51it/s][A
- 73%|█████████████████▍      | 130/179 [00:47<00:17,  2.73it/s][A
- 73%|█████████████████▌      | 131/179 [00:47<00:16,  2.84it/s][A
- 74%|█████████████████▋      | 132/179 [00:47<00:16,  2.92it/s][A
- 74%|█████████████████▊      | 133/179 [00:48<00:18,  2.51it/s][A
- 75%|█████████████████▉      | 134/179 [00:48<00:16,  2.73it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.83it/s][A
- 76%|██████████████████▏     | 136/179 [00:49<00:14,  2.90it/s][A
- 77%|██████████████████▎     | 137/179 [00:49<00:16,  2.49it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:15,  2.71it/s][A
- 78%|██████████████████▋     | 139/179 [00:50<00:14,  2.82it/s][A
- 78%|██████████████████▊     | 140/179 [00:50<00:13,  2.90it/s][A
- 79%|██████████████████▉     | 141/179 [00:51<00:15,  2.50it/s][A
- 79%|███████████████████     | 142/179 [00:51<00:13,  2.72it/s][A
- 80%|███████████████████▏    | 143/179 [00:51<00:12,  2.82it/s][A
- 80%|███████████████████▎    | 144/179 [00:52<00:12,  2.89it/s][A
- 81%|███████████████████▍    | 145/179 [00:52<00:13,  2.49it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.71it/s][A
- 82%|███████████████████▋    | 147/179 [00:53<00:11,  2.83it/s][A
- 83%|███████████████████▊    | 148/179 [00:53<00:10,  2.91it/s][A
- 83%|███████████████████▉    | 149/179 [00:54<00:11,  2.51it/s][A
- 84%|████████████████████    | 150/179 [00:54<00:10,  2.73it/s][A
- 84%|████████████████████▏   | 151/179 [00:54<00:09,  2.84it/s][A
- 85%|████████████████████▍   | 152/179 [00:55<00:09,  2.91it/s][A
- 85%|████████████████████▌   | 153/179 [00:55<00:10,  2.51it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.74it/s][A
- 87%|████████████████████▊   | 155/179 [00:56<00:08,  2.85it/s][A
- 87%|████████████████████▉   | 156/179 [00:56<00:07,  2.91it/s][A
- 88%|█████████████████████   | 157/179 [00:57<00:08,  2.50it/s][A
- 88%|█████████████████████▏  | 158/179 [00:57<00:07,  2.73it/s][A
- 89%|█████████████████████▎  | 159/179 [00:57<00:07,  2.84it/s][A
- 89%|█████████████████████▍  | 160/179 [00:58<00:06,  2.92it/s][A
- 90%|█████████████████████▌  | 161/179 [00:58<00:07,  2.51it/s][A
- 91%|█████████████████████▋  | 162/179 [00:58<00:06,  2.73it/s][A
- 91%|█████████████████████▊  | 163/179 [00:59<00:05,  2.84it/s][A
- 92%|█████████████████████▉  | 164/179 [00:59<00:05,  2.92it/s][A
- 92%|██████████████████████  | 165/179 [01:00<00:05,  2.50it/s][A
- 93%|██████████████████████▎ | 166/179 [01:00<00:04,  2.72it/s][A
- 93%|██████████████████████▍ | 167/179 [01:00<00:04,  2.83it/s][A
- 94%|██████████████████████▌ | 168/179 [01:01<00:03,  2.91it/s][A
- 94%|██████████████████████▋ | 169/179 [01:01<00:03,  2.51it/s][A
- 95%|██████████████████████▊ | 170/179 [01:01<00:03,  2.73it/s][A
- 96%|██████████████████████▉ | 171/179 [01:02<00:02,  2.83it/s][A
- 96%|███████████████████████ | 172/179 [01:02<00:02,  2.91it/s][A
- 97%|███████████████████████▏| 173/179 [01:03<00:02,  2.51it/s][A
- 97%|███████████████████████▎| 174/179 [01:03<00:01,  2.74it/s][A
- 98%|███████████████████████▍| 175/179 [01:03<00:01,  2.84it/s][A
- 98%|███████████████████████▌| 176/179 [01:03<00:01,  2.91it/s][A
- 99%|███████████████████████▋| 177/179 [01:04<00:00,  2.51it/s][A
- 99%|███████████████████████▊| 178/179 [01:04<00:00,  2.73it/s][A
-100%|████████████████████████| 179/179 [01:05<00:00,  2.65it/s][A                                                               
-                                                               [A{'eval_loss': 2.2662014961242676, 'eval_runtime': 67.1457, 'eval_samples_per_second': 2.919, 'eval_steps_per_second': 1.46, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.8}
- 80%|████████████████▊    | 800/1000 [4:39:04<25:27,  7.64s/it]
-100%|████████████████████████| 179/179 [01:05<00:00,  2.65it/s][A
-                                                               [A[2025-10-18 23:41:50,902] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-800
- 80%|███████████████▏   | 801/1000 [4:39:14<1:40:29, 30.30s/it]                                                               {'loss': 2.5475, 'grad_norm': 0.8855046033859253, 'learning_rate': 2.025571894372794e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 815.83, 'epoch': 0.8}
- 80%|███████████████▏   | 801/1000 [4:39:14<1:40:29, 30.30s/it] 80%|███████████████▏   | 802/1000 [4:39:21<1:17:31, 23.49s/it]                                                               {'loss': 2.247, 'grad_norm': 0.7530855536460876, 'learning_rate': 2.0060712799926408e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1040.02, 'epoch': 0.8}
- 80%|███████████████▏   | 802/1000 [4:39:21<1:17:31, 23.49s/it] 80%|███████████████▎   | 803/1000 [4:39:29<1:01:29, 18.73s/it]                                                               {'loss': 2.3472, 'grad_norm': 0.7473617792129517, 'learning_rate': 1.9866545181421013e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1238.13, 'epoch': 0.8}
- 80%|███████████████▎   | 803/1000 [4:39:29<1:01:29, 18.73s/it] 80%|████████████████▉    | 804/1000 [4:39:37<50:16, 15.39s/it]                                                               {'loss': 2.2072, 'grad_norm': 0.781851053237915, 'learning_rate': 1.967321812493813e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 963.0, 'epoch': 0.8}
- 80%|████████████████▉    | 804/1000 [4:39:37<50:16, 15.39s/it] 80%|████████████████▉    | 805/1000 [4:39:44<42:23, 13.05s/it]                                                               {'loss': 2.4874, 'grad_norm': 0.8798803687095642, 'learning_rate': 1.9480733658387175e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 849.45, 'epoch': 0.81}
- 80%|████████████████▉    | 805/1000 [4:39:44<42:23, 13.05s/it] 81%|████████████████▉    | 806/1000 [4:39:52<36:52, 11.41s/it]                                                               {'loss': 2.2533, 'grad_norm': 0.8362975120544434, 'learning_rate': 1.9289093800839066e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 867.44, 'epoch': 0.81}
- 81%|████████████████▉    | 806/1000 [4:39:52<36:52, 11.41s/it] 81%|████████████████▉    | 807/1000 [4:39:59<33:02, 10.27s/it]                                                               {'loss': 2.3517, 'grad_norm': 0.7957280278205872, 'learning_rate': 1.9098300562505266e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 973.36, 'epoch': 0.81}
- 81%|████████████████▉    | 807/1000 [4:39:59<33:02, 10.27s/it] 81%|████████████████▉    | 808/1000 [4:40:07<30:19,  9.47s/it]                                                               {'loss': 2.377, 'grad_norm': 0.7471276521682739, 'learning_rate': 1.8908355944716517e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1121.0, 'epoch': 0.81}
- 81%|████████████████▉    | 808/1000 [4:40:07<30:19,  9.47s/it] 81%|████████████████▉    | 809/1000 [4:40:15<28:23,  8.92s/it]                                                               {'loss': 2.2458, 'grad_norm': 0.821804404258728, 'learning_rate': 1.871926193990202e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 899.41, 'epoch': 0.81}
- 81%|████████████████▉    | 809/1000 [4:40:15<28:23,  8.92s/it] 81%|█████████████████    | 810/1000 [4:40:22<27:00,  8.53s/it]                                                               {'loss': 2.3164, 'grad_norm': 0.7253827452659607, 'learning_rate': 1.8531020531568378e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1285.94, 'epoch': 0.81}
- 81%|█████████████████    | 810/1000 [4:40:22<27:00,  8.53s/it] 81%|█████████████████    | 811/1000 [4:40:30<26:02,  8.27s/it]                                                               {'loss': 2.3205, 'grad_norm': 0.8283148407936096, 'learning_rate': 1.8343633694278895e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1041.43, 'epoch': 0.81}
- 81%|█████████████████    | 811/1000 [4:40:30<26:02,  8.27s/it] 81%|█████████████████    | 812/1000 [4:40:37<25:17,  8.07s/it]                                                               {'loss': 2.3668, 'grad_norm': 0.9045469164848328, 'learning_rate': 1.8157103393632868e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 974.44, 'epoch': 0.81}
- 81%|█████████████████    | 812/1000 [4:40:37<25:17,  8.07s/it] 81%|█████████████████    | 813/1000 [4:40:45<24:42,  7.93s/it]                                                               {'loss': 2.3236, 'grad_norm': 0.8439998030662537, 'learning_rate': 1.7971431586244815e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 849.34, 'epoch': 0.81}
- 81%|█████████████████    | 813/1000 [4:40:45<24:42,  7.93s/it] 81%|█████████████████    | 814/1000 [4:40:53<24:16,  7.83s/it]                                                               {'loss': 2.3213, 'grad_norm': 0.8810031414031982, 'learning_rate': 1.7786620219724204e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 813.5, 'epoch': 0.81}
- 81%|█████████████████    | 814/1000 [4:40:53<24:16,  7.83s/it] 82%|█████████████████    | 815/1000 [4:41:00<23:56,  7.76s/it]                                                               {'loss': 2.2608, 'grad_norm': 0.8110799193382263, 'learning_rate': 1.7602671232654754e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1028.09, 'epoch': 0.81}
- 82%|█████████████████    | 815/1000 [4:41:00<23:56,  7.76s/it] 82%|█████████████████▏   | 816/1000 [4:41:08<23:39,  7.71s/it]                                                               {'loss': 2.1649, 'grad_norm': 0.7994750142097473, 'learning_rate': 1.741958655457436e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1055.67, 'epoch': 0.82}
- 82%|█████████████████▏   | 816/1000 [4:41:08<23:39,  7.71s/it] 82%|█████████████████▏   | 817/1000 [4:41:15<23:26,  7.69s/it]                                                               {'loss': 2.5244, 'grad_norm': 0.8477999567985535, 'learning_rate': 1.723736810595461e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1066.53, 'epoch': 0.82}
- 82%|█████████████████▏   | 817/1000 [4:41:15<23:26,  7.69s/it] 82%|█████████████████▏   | 818/1000 [4:41:23<23:17,  7.68s/it]                                                               {'loss': 2.3112, 'grad_norm': 0.8432174921035767, 'learning_rate': 1.7056017798180824e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 935.21, 'epoch': 0.82}
- 82%|█████████████████▏   | 818/1000 [4:41:23<23:17,  7.68s/it] 82%|█████████████████▏   | 819/1000 [4:41:31<23:06,  7.66s/it]                                                               {'loss': 2.3145, 'grad_norm': 0.833696722984314, 'learning_rate': 1.6875537533531948e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 952.47, 'epoch': 0.82}
- 82%|█████████████████▏   | 819/1000 [4:41:31<23:06,  7.66s/it] 82%|█████████████████▏   | 820/1000 [4:41:38<22:58,  7.66s/it]                                                               {'loss': 2.287, 'grad_norm': 1.887503981590271, 'learning_rate': 1.6695929205160487e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1095.31, 'epoch': 0.82}
- 82%|█████████████████▏   | 820/1000 [4:41:38<22:58,  7.66s/it] 82%|█████████████████▏   | 821/1000 [4:41:46<22:49,  7.65s/it]                                                               {'loss': 2.2198, 'grad_norm': 0.9876425266265869, 'learning_rate': 1.65171946970729e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 671.33, 'epoch': 0.82}
- 82%|█████████████████▏   | 821/1000 [4:41:46<22:49,  7.65s/it] 82%|█████████████████▎   | 822/1000 [4:41:54<22:40,  7.64s/it]                                                               {'loss': 2.3117, 'grad_norm': 0.8462584018707275, 'learning_rate': 1.6339335884109518e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 884.97, 'epoch': 0.82}
- 82%|█████████████████▎   | 822/1000 [4:41:54<22:40,  7.64s/it] 82%|█████████████████▎   | 823/1000 [4:42:01<22:32,  7.64s/it]                                                               {'loss': 2.4947, 'grad_norm': 0.7991177439689636, 'learning_rate': 1.6162354631925204e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1022.07, 'epoch': 0.82}
- 82%|█████████████████▎   | 823/1000 [4:42:01<22:32,  7.64s/it] 82%|█████████████████▎   | 824/1000 [4:42:09<22:24,  7.64s/it]                                                               {'loss': 2.2911, 'grad_norm': 1.0462646484375, 'learning_rate': 1.598625279696948e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 659.78, 'epoch': 0.82}
- 82%|█████████████████▎   | 824/1000 [4:42:09<22:24,  7.64s/it] 82%|█████████████████▎   | 825/1000 [4:42:17<22:15,  7.63s/it]                                                               {'loss': 2.2368, 'grad_norm': 0.9889011979103088, 'learning_rate': 1.5811032226467305e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 670.55, 'epoch': 0.82}
- 82%|█████████████████▎   | 825/1000 [4:42:17<22:15,  7.63s/it] 83%|█████████████████▎   | 826/1000 [4:42:24<22:07,  7.63s/it]                                                               {'loss': 2.2934, 'grad_norm': 0.7792747020721436, 'learning_rate': 1.563669475839956e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1174.65, 'epoch': 0.83}
- 83%|█████████████████▎   | 826/1000 [4:42:24<22:07,  7.63s/it] 83%|█████████████████▎   | 827/1000 [4:42:32<21:58,  7.62s/it]                                                               {'loss': 2.2096, 'grad_norm': 0.9597637057304382, 'learning_rate': 1.5463242221483743e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 694.64, 'epoch': 0.83}
- 83%|█████████████████▎   | 827/1000 [4:42:32<21:58,  7.62s/it] 83%|█████████████████▍   | 828/1000 [4:42:39<21:51,  7.62s/it]                                                               {'loss': 2.3225, 'grad_norm': 0.7568403482437134, 'learning_rate': 1.529067643515495e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1183.11, 'epoch': 0.83}
- 83%|█████████████████▍   | 828/1000 [4:42:39<21:51,  7.62s/it] 83%|█████████████████▍   | 829/1000 [4:42:47<21:44,  7.63s/it]                                                               {'loss': 2.2027, 'grad_norm': 0.9818094372749329, 'learning_rate': 1.5118999209546559e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 693.84, 'epoch': 0.83}
- 83%|█████████████████▍   | 829/1000 [4:42:47<21:44,  7.63s/it] 83%|█████████████████▍   | 830/1000 [4:42:55<21:37,  7.63s/it]                                                               {'loss': 2.2435, 'grad_norm': 0.8035482168197632, 'learning_rate': 1.4948212345471491e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 989.84, 'epoch': 0.83}
- 83%|█████████████████▍   | 830/1000 [4:42:55<21:37,  7.63s/it] 83%|█████████████████▍   | 831/1000 [4:43:02<21:30,  7.63s/it]                                                               {'loss': 2.5088, 'grad_norm': 0.8237957954406738, 'learning_rate': 1.4778317634403083e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1017.82, 'epoch': 0.83}
- 83%|█████████████████▍   | 831/1000 [4:43:02<21:30,  7.63s/it] 83%|█████████████████▍   | 832/1000 [4:43:10<21:23,  7.64s/it]                                                               {'loss': 2.5079, 'grad_norm': 0.7889134287834167, 'learning_rate': 1.460931685845649e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1414.35, 'epoch': 0.83}
- 83%|█████████████████▍   | 832/1000 [4:43:10<21:23,  7.64s/it] 83%|█████████████████▍   | 833/1000 [4:43:18<21:16,  7.64s/it]                                                               {'loss': 2.3467, 'grad_norm': 0.9610748887062073, 'learning_rate': 1.444121179036989e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 908.95, 'epoch': 0.83}
- 83%|█████████████████▍   | 833/1000 [4:43:18<21:16,  7.64s/it] 83%|█████████████████▌   | 834/1000 [4:43:25<21:07,  7.64s/it]                                                               {'loss': 2.3696, 'grad_norm': 1.4238123893737793, 'learning_rate': 1.427400419348588e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 371.37, 'epoch': 0.83}
- 83%|█████████████████▌   | 834/1000 [4:43:25<21:07,  7.64s/it] 84%|█████████████████▌   | 835/1000 [4:43:33<21:00,  7.64s/it]                                                               {'loss': 2.5734, 'grad_norm': 0.9208681583404541, 'learning_rate': 1.4107695821733025e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 936.8, 'epoch': 0.83}
- 84%|█████████████████▌   | 835/1000 [4:43:33<21:00,  7.64s/it] 84%|█████████████████▌   | 836/1000 [4:43:41<20:51,  7.63s/it]                                                               {'loss': 2.1561, 'grad_norm': 1.0679666996002197, 'learning_rate': 1.3942288419607475e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 553.65, 'epoch': 0.84}
- 84%|█████████████████▌   | 836/1000 [4:43:41<20:51,  7.63s/it] 84%|█████████████████▌   | 837/1000 [4:43:48<20:45,  7.64s/it]                                                               {'loss': 2.3452, 'grad_norm': 0.7555798888206482, 'learning_rate': 1.3777783722154603e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1197.18, 'epoch': 0.84}
- 84%|█████████████████▌   | 837/1000 [4:43:48<20:45,  7.64s/it] 84%|█████████████████▌   | 838/1000 [4:43:56<20:38,  7.64s/it]                                                               {'loss': 2.2581, 'grad_norm': 0.9034664630889893, 'learning_rate': 1.3614183454950824e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 788.46, 'epoch': 0.84}
- 84%|█████████████████▌   | 838/1000 [4:43:56<20:38,  7.64s/it] 84%|█████████████████▌   | 839/1000 [4:44:03<20:30,  7.64s/it]                                                               {'loss': 2.5383, 'grad_norm': 1.0204998254776, 'learning_rate': 1.3451489334085554e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 707.82, 'epoch': 0.84}
- 84%|█████████████████▌   | 839/1000 [4:44:03<20:30,  7.64s/it] 84%|█████████████████▋   | 840/1000 [4:44:11<20:23,  7.65s/it]                                                               {'loss': 2.2092, 'grad_norm': 0.8805344104766846, 'learning_rate': 1.3289703066143111e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 895.52, 'epoch': 0.84}
- 84%|█████████████████▋   | 840/1000 [4:44:11<20:23,  7.65s/it] 84%|█████████████████▋   | 841/1000 [4:44:19<20:15,  7.65s/it]                                                               {'loss': 2.2734, 'grad_norm': 0.7917156219482422, 'learning_rate': 1.3128826348184887e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1117.49, 'epoch': 0.84}
- 84%|█████████████████▋   | 841/1000 [4:44:19<20:15,  7.65s/it] 84%|█████████████████▋   | 842/1000 [4:44:26<20:07,  7.64s/it]                                                               {'loss': 2.2599, 'grad_norm': 0.7989560961723328, 'learning_rate': 1.2968860867731569e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1080.95, 'epoch': 0.84}
- 84%|█████████████████▋   | 842/1000 [4:44:26<20:07,  7.64s/it] 84%|█████████████████▋   | 843/1000 [4:44:34<19:58,  7.63s/it]                                                               {'loss': 2.4003, 'grad_norm': 1.0275263786315918, 'learning_rate': 1.2809808302745297e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 647.14, 'epoch': 0.84}
- 84%|█████████████████▋   | 843/1000 [4:44:34<19:58,  7.63s/it] 84%|█████████████████▋   | 844/1000 [4:44:42<19:50,  7.63s/it]                                                               {'loss': 2.3359, 'grad_norm': 0.9500201940536499, 'learning_rate': 1.2651670321612263e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 759.69, 'epoch': 0.84}
- 84%|█████████████████▋   | 844/1000 [4:44:42<19:50,  7.63s/it] 84%|█████████████████▋   | 845/1000 [4:44:49<19:41,  7.62s/it]                                                               {'loss': 2.5258, 'grad_norm': 0.884848415851593, 'learning_rate': 1.2494448583125018e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 865.04, 'epoch': 0.84}
- 84%|█████████████████▋   | 845/1000 [4:44:49<19:41,  7.62s/it] 85%|█████████████████▊   | 846/1000 [4:44:57<19:33,  7.62s/it]                                                               {'loss': 2.3368, 'grad_norm': 0.7702264189720154, 'learning_rate': 1.233814473646524e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1138.56, 'epoch': 0.85}
- 85%|█████████████████▊   | 846/1000 [4:44:57<19:33,  7.62s/it] 85%|█████████████████▊   | 847/1000 [4:45:04<19:25,  7.62s/it]                                                               {'loss': 2.2957, 'grad_norm': 0.7915887236595154, 'learning_rate': 1.218276042118629e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1026.25, 'epoch': 0.85}
- 85%|█████████████████▊   | 847/1000 [4:45:04<19:25,  7.62s/it] 85%|█████████████████▊   | 848/1000 [4:45:12<19:18,  7.62s/it]                                                               {'loss': 2.3149, 'grad_norm': 0.8546216487884521, 'learning_rate': 1.202829726719611e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1020.82, 'epoch': 0.85}
- 85%|█████████████████▊   | 848/1000 [4:45:12<19:18,  7.62s/it] 85%|█████████████████▊   | 849/1000 [4:45:20<19:12,  7.63s/it]                                                               {'loss': 2.3966, 'grad_norm': 0.9208575487136841, 'learning_rate': 1.1874756894740135e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 820.25, 'epoch': 0.85}
- 85%|█████████████████▊   | 849/1000 [4:45:20<19:12,  7.63s/it] 85%|█████████████████▊   | 850/1000 [4:45:27<19:04,  7.63s/it]                                                               {'loss': 2.5086, 'grad_norm': 0.7690078020095825, 'learning_rate': 1.172214091438416e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1081.81, 'epoch': 0.85}
- 85%|█████████████████▊   | 850/1000 [4:45:27<19:04,  7.63s/it][2025-10-18 23:48:14,515] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-18 23:48:17,009] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.192716360092163
-[2025-10-18 23:48:18,222] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.211850881576538
-[2025-10-18 23:48:19,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.199242353439331
-[2025-10-18 23:48:20,618] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.1965422630310059
-[2025-10-18 23:48:20,618] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:29,  6.10it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.37it/s][A
-  2%|▌                         | 4/179 [00:00<00:45,  3.81it/s][A
-  3%|▋                         | 5/179 [00:01<01:17,  2.24it/s][A
-  3%|▊                         | 6/179 [00:02<01:08,  2.54it/s][A
-  4%|█                         | 7/179 [00:02<01:03,  2.69it/s][A
-  4%|█▏                        | 8/179 [00:02<01:00,  2.82it/s][A
-  5%|█▎                        | 9/179 [00:03<01:09,  2.45it/s][A
-  6%|█▍                       | 10/179 [00:03<01:02,  2.69it/s][A
-  6%|█▌                       | 11/179 [00:03<00:59,  2.81it/s][A
-  7%|█▋                       | 12/179 [00:04<00:57,  2.90it/s][A
-  7%|█▊                       | 13/179 [00:04<01:06,  2.49it/s][A
-  8%|█▉                       | 14/179 [00:04<01:00,  2.72it/s][A
-  8%|██                       | 15/179 [00:05<00:57,  2.83it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.91it/s][A
-  9%|██▎                      | 17/179 [00:06<01:04,  2.50it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.72it/s][A
- 11%|██▋                      | 19/179 [00:06<00:56,  2.84it/s][A
- 11%|██▊                      | 20/179 [00:07<00:54,  2.92it/s][A
- 12%|██▉                      | 21/179 [00:07<01:02,  2.51it/s][A
- 12%|███                      | 22/179 [00:07<00:57,  2.73it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.84it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.92it/s][A
- 14%|███▍                     | 25/179 [00:09<01:01,  2.51it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.73it/s][A
- 15%|███▊                     | 27/179 [00:09<00:53,  2.83it/s][A
- 16%|███▉                     | 28/179 [00:09<00:51,  2.91it/s][A
- 16%|████                     | 29/179 [00:10<00:59,  2.51it/s][A
- 17%|████▏                    | 30/179 [00:10<00:54,  2.74it/s][A
- 17%|████▎                    | 31/179 [00:11<00:52,  2.84it/s][A
- 18%|████▍                    | 32/179 [00:11<00:50,  2.92it/s][A
- 18%|████▌                    | 33/179 [00:11<00:58,  2.51it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.73it/s][A
- 20%|████▉                    | 35/179 [00:12<00:50,  2.84it/s][A
- 20%|█████                    | 36/179 [00:12<00:49,  2.92it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:56,  2.51it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:51,  2.74it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.84it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:47,  2.92it/s][A
- 23%|█████▋                   | 41/179 [00:15<01:00,  2.28it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:53,  2.54it/s][A
- 24%|██████                   | 43/179 [00:15<00:50,  2.69it/s][A
- 25%|██████▏                  | 44/179 [00:15<00:48,  2.81it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.45it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.69it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:46,  2.81it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.89it/s][A
- 27%|██████▊                  | 49/179 [00:17<00:52,  2.50it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.72it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.83it/s][A
- 29%|███████▎                 | 52/179 [00:18<00:43,  2.90it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:50,  2.50it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:45,  2.72it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:43,  2.84it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.91it/s][A
- 32%|███████▉                 | 57/179 [00:20<00:48,  2.51it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.73it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.84it/s][A
- 34%|████████▍                | 60/179 [00:21<00:40,  2.91it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.50it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.72it/s][A
- 35%|████████▊                | 63/179 [00:22<00:40,  2.83it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.92it/s][A
- 36%|█████████                | 65/179 [00:23<00:45,  2.50it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.73it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:39,  2.84it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.91it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:43,  2.51it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:39,  2.73it/s][A
- 40%|█████████▉               | 71/179 [00:25<00:38,  2.84it/s][A
- 40%|██████████               | 72/179 [00:26<00:36,  2.91it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.51it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.73it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:36,  2.84it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.92it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:40,  2.51it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.73it/s][A
- 44%|███████████              | 79/179 [00:28<00:35,  2.83it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.91it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.51it/s][A
- 46%|███████████▍             | 82/179 [00:29<00:35,  2.73it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:33,  2.83it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:32,  2.91it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:37,  2.50it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.73it/s][A
- 49%|████████████▏            | 87/179 [00:31<00:32,  2.84it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.92it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:35,  2.51it/s][A
- 50%|████████████▌            | 90/179 [00:32<00:32,  2.74it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.83it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:29,  2.91it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.51it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.74it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:29,  2.85it/s][A
- 54%|█████████████▍           | 96/179 [00:34<00:28,  2.92it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:32,  2.51it/s][A
- 55%|█████████████▋           | 98/179 [00:35<00:29,  2.73it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.83it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.91it/s][A
- 56%|█████████████▌          | 101/179 [00:36<00:31,  2.51it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.73it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:26,  2.84it/s][A
- 58%|█████████████▉          | 104/179 [00:37<00:25,  2.90it/s][A
- 59%|██████████████          | 105/179 [00:38<00:29,  2.50it/s][A
- 59%|██████████████▏         | 106/179 [00:38<00:26,  2.73it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.83it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.91it/s][A
- 61%|██████████████▌         | 109/179 [00:39<00:27,  2.51it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.73it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:23,  2.84it/s][A
- 63%|███████████████         | 112/179 [00:40<00:22,  2.92it/s][A
- 63%|████████████���██▏        | 113/179 [00:41<00:26,  2.50it/s][A
- 64%|███████████████▎        | 114/179 [00:41<00:23,  2.73it/s][A
- 64%|███████████████▍        | 115/179 [00:41<00:22,  2.84it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.90it/s][A
- 65%|███████████████▋        | 117/179 [00:42<00:24,  2.50it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.72it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.83it/s][A
- 67%|████████████████        | 120/179 [00:43<00:20,  2.91it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.51it/s][A
- 68%|████████████████▎       | 122/179 [00:44<00:20,  2.74it/s][A
- 69%|████████████████▍       | 123/179 [00:44<00:19,  2.84it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:18,  2.91it/s][A
- 70%|████████████████▊       | 125/179 [00:45<00:21,  2.51it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.73it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.84it/s][A
- 72%|█████████████████▏      | 128/179 [00:46<00:17,  2.91it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:19,  2.50it/s][A
- 73%|█████████████████▍      | 130/179 [00:47<00:18,  2.72it/s][A
- 73%|█████████████████▌      | 131/179 [00:47<00:16,  2.83it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.91it/s][A
- 74%|█████████████████▊      | 133/179 [00:48<00:18,  2.50it/s][A
- 75%|█████████████████▉      | 134/179 [00:48<00:16,  2.72it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.82it/s][A
- 76%|██████████████████▏     | 136/179 [00:49<00:14,  2.90it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:16,  2.50it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:15,  2.73it/s][A
- 78%|██████████████████▋     | 139/179 [00:50<00:14,  2.83it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.90it/s][A
- 79%|██████████████████▉     | 141/179 [00:51<00:15,  2.50it/s][A
- 79%|███████████████████     | 142/179 [00:51<00:13,  2.72it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.82it/s][A
- 80%|███████████████████▎    | 144/179 [00:52<00:12,  2.90it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.49it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.71it/s][A
- 82%|███████████████████▋    | 147/179 [00:53<00:11,  2.82it/s][A
- 83%|███████████████████▊    | 148/179 [00:53<00:10,  2.91it/s][A
- 83%|███████████████████▉    | 149/179 [00:54<00:11,  2.51it/s][A
- 84%|████████████████████    | 150/179 [00:54<00:10,  2.73it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:09,  2.83it/s][A
- 85%|████████████████████▍   | 152/179 [00:55<00:09,  2.91it/s][A
- 85%|████████████████████▌   | 153/179 [00:55<00:10,  2.51it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.73it/s][A
- 87%|████████████████████▊   | 155/179 [00:56<00:08,  2.84it/s][A
- 87%|████████████████████▉   | 156/179 [00:56<00:07,  2.91it/s][A
- 88%|█████████████████████   | 157/179 [00:57<00:08,  2.51it/s][A
- 88%|█████████████████████▏  | 158/179 [00:57<00:07,  2.73it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.84it/s][A
- 89%|█████████████████████▍  | 160/179 [00:58<00:06,  2.92it/s][A
- 90%|█████████████████████▌  | 161/179 [00:58<00:07,  2.51it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.73it/s][A
- 91%|█████████████████████▊  | 163/179 [00:59<00:05,  2.84it/s][A
- 92%|█████████████████████▉  | 164/179 [00:59<00:05,  2.92it/s][A
- 92%|██████████████████████  | 165/179 [01:00<00:05,  2.51it/s][A
- 93%|██████████████████████▎ | 166/179 [01:00<00:04,  2.72it/s][A
- 93%|██████████████████████▍ | 167/179 [01:00<00:04,  2.83it/s][A
- 94%|██████████████████████▌ | 168/179 [01:01<00:03,  2.91it/s][A
- 94%|██████████████████████▋ | 169/179 [01:01<00:03,  2.51it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.74it/s][A
- 96%|██████████████████████▉ | 171/179 [01:02<00:02,  2.84it/s][A
- 96%|███████████████████████ | 172/179 [01:02<00:02,  2.91it/s][A
- 97%|███████████████████████▏| 173/179 [01:03<00:02,  2.51it/s][A
- 97%|███████████████████████▎| 174/179 [01:03<00:01,  2.73it/s][A
- 98%|███████████████████████▍| 175/179 [01:03<00:01,  2.84it/s][A
- 98%|███████████████████████▌| 176/179 [01:04<00:01,  2.91it/s][A
- 99%|███████████████████████▋| 177/179 [01:04<00:00,  2.51it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.73it/s][A
-100%|████████████████████████| 179/179 [01:05<00:00,  2.64it/s][A                                                               
-                                                               [A{'eval_loss': 2.255345344543457, 'eval_runtime': 67.5445, 'eval_samples_per_second': 2.902, 'eval_steps_per_second': 1.451, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.85}
- 85%|█████████████████▊   | 850/1000 [4:46:41<19:04,  7.63s/it]
-100%|████████████████████████| 179/179 [01:05<00:00,  2.64it/s][A
-                                                               [A[2025-10-18 23:49:28,168] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-850
- 85%|████████████████▏  | 851/1000 [4:46:51<1:15:31, 30.41s/it]                                                               {'loss': 2.5343, 'grad_norm': 0.9348514080047607, 'learning_rate': 1.1570450926997655e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 777.77, 'epoch': 0.85}
- 85%|████████████████▏  | 851/1000 [4:46:51<1:15:31, 30.41s/it] 85%|█████████████████▉   | 852/1000 [4:46:59<58:07, 23.57s/it]                                                               {'loss': 2.406, 'grad_norm': 0.7999914884567261, 'learning_rate': 1.141968852373676e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1110.95, 'epoch': 0.85}
- 85%|█████████████████▉   | 852/1000 [4:46:59<58:07, 23.57s/it] 85%|█████████████████▉   | 853/1000 [4:47:06<46:00, 18.78s/it]                                                               {'loss': 2.3516, 'grad_norm': 0.8471423983573914, 'learning_rate': 1.1269855286027797e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 895.28, 'epoch': 0.85}
- 85%|█████████████████▉   | 853/1000 [4:47:06<46:00, 18.78s/it] 85%|█████████████████▉   | 854/1000 [4:47:14<37:32, 15.43s/it]                                                               {'loss': 2.2398, 'grad_norm': 0.7942390441894531, 'learning_rate': 1.1120952785550476e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 959.39, 'epoch': 0.85}
- 85%|█████████████████▉   | 854/1000 [4:47:14<37:32, 15.43s/it] 86%|█████████████████▉   | 855/1000 [4:47:21<31:37, 13.08s/it]                                                               {'loss': 2.4515, 'grad_norm': 0.9742594957351685, 'learning_rate': 1.0972982584221592e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 711.68, 'epoch': 0.85}
- 86%|█████████████████▉   | 855/1000 [4:47:21<31:37, 13.08s/it] 86%|█████████████████▉   | 856/1000 [4:47:29<27:27, 11.44s/it]                                                               {'loss': 2.41, 'grad_norm': 0.7432851791381836, 'learning_rate': 1.0825946234178574e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1232.15, 'epoch': 0.86}
- 86%|█████████████████▉   | 856/1000 [4:47:29<27:27, 11.44s/it] 86%|█████████████████▉   | 857/1000 [4:47:37<24:30, 10.29s/it]                                                               {'loss': 2.2582, 'grad_norm': 0.8477596640586853, 'learning_rate': 1.067984527776309e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 900.24, 'epoch': 0.86}
- 86%|█████████████████▉   | 857/1000 [4:47:37<24:30, 10.29s/it] 86%|██████████████████   | 858/1000 [4:47:44<22:25,  9.48s/it]                                                               {'loss': 2.527, 'grad_norm': 1.04975163936615, 'learning_rate': 1.0534681247505106e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 704.6, 'epoch': 0.86}
- 86%|██████████████████   | 858/1000 [4:47:44<22:25,  9.48s/it] 86%|██████████████████   | 859/1000 [4:47:52<20:58,  8.93s/it]                                                               {'loss': 2.1511, 'grad_norm': 0.8263856768608093, 'learning_rate': 1.0390455666106547e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 837.66, 'epoch': 0.86}
- 86%|██████████████████   | 859/1000 [4:47:52<20:58,  8.93s/it] 86%|██████████████████   | 860/1000 [4:47:59<19:55,  8.54s/it]                                                               {'loss': 2.1835, 'grad_norm': 0.8831968307495117, 'learning_rate': 1.024717004642557e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 861.04, 'epoch': 0.86}
- 86%|██████████████████   | 860/1000 [4:47:59<19:55,  8.54s/it] 86%|██████████████████   | 861/1000 [4:48:07<19:09,  8.27s/it]                                                               {'loss': 2.5053, 'grad_norm': 0.8195421695709229, 'learning_rate': 1.010482589146048e-05, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1167.73, 'epoch': 0.86}
- 86%|██████████████████   | 861/1000 [4:48:07<19:09,  8.27s/it] 86%|██████████████████   | 862/1000 [4:48:15<18:34,  8.08s/it]                                                               {'loss': 2.247, 'grad_norm': 0.8778640627861023, 'learning_rate': 9.963424694334122e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 805.45, 'epoch': 0.86}
- 86%|██████████████████   | 862/1000 [4:48:15<18:34,  8.08s/it] 86%|██████████████████   | 863/1000 [4:48:22<18:08,  7.94s/it]                                                               {'loss': 2.4341, 'grad_norm': 0.8047611713409424, 'learning_rate': 9.822967938278171e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1001.86, 'epoch': 0.86}
- 86%|██████████████████   | 863/1000 [4:48:22<18:08,  7.94s/it] 86%|██████████████████▏  | 864/1000 [4:48:30<17:48,  7.86s/it]                                                               {'loss': 2.2369, 'grad_norm': 0.8589877486228943, 'learning_rate': 9.683457096617488e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 910.62, 'epoch': 0.86}
- 86%|██████████████████▏  | 864/1000 [4:48:30<17:48,  7.86s/it] 86%|██████████████████▏  | 865/1000 [4:48:38<17:33,  7.81s/it]                                                               {'loss': 2.6835, 'grad_norm': 0.8734748959541321, 'learning_rate': 9.544893632754814e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1007.69, 'epoch': 0.86}
- 86%|██████████████████▏  | 865/1000 [4:48:38<17:33,  7.81s/it] 87%|██████████████████▏  | 866/1000 [4:48:45<17:19,  7.76s/it]                                                               {'loss': 2.2365, 'grad_norm': 0.863456666469574, 'learning_rate': 9.407279000155312e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 876.35, 'epoch': 0.87}
- 87%|██████████████████▏  | 866/1000 [4:48:45<17:19,  7.76s/it] 87%|██████████████████▏  | 867/1000 [4:48:53<17:07,  7.73s/it]                                                               {'loss': 2.3297, 'grad_norm': 0.9424015879631042, 'learning_rate': 9.270614642331376e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 817.28, 'epoch': 0.87}
- 87%|██████████████████▏  | 867/1000 [4:48:53<17:07,  7.73s/it] 87%|██████████████████▏  | 868/1000 [4:49:01<16:59,  7.72s/it]                                                               {'loss': 2.4176, 'grad_norm': 0.9516196846961975, 'learning_rate': 9.134901992827427e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 762.35, 'epoch': 0.87}
- 87%|██████████████████▏  | 868/1000 [4:49:01<16:59,  7.72s/it] 87%|██████████████████▏  | 869/1000 [4:49:08<16:51,  7.72s/it]                                                               {'loss': 2.312, 'grad_norm': 0.8385700583457947, 'learning_rate': 9.000142475204964e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 901.15, 'epoch': 0.87}
- 87%|██████████████████▏  | 869/1000 [4:49:08<16:51,  7.72s/it] 87%|██████████████████▎  | 870/1000 [4:49:16<16:42,  7.71s/it]                                                               {'loss': 2.4659, 'grad_norm': 0.7700273394584656, 'learning_rate': 8.866337503027522e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1289.28, 'epoch': 0.87}
- 87%|██████████████████▎  | 870/1000 [4:49:16<16:42,  7.71s/it] 87%|██████████████████▎  | 871/1000 [4:49:24<16:34,  7.71s/it]                                                               {'loss': 2.256, 'grad_norm': 1.0232266187667847, 'learning_rate': 8.733488479845997e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 657.33, 'epoch': 0.87}
- 87%|██████████████████▎  | 871/1000 [4:49:24<16:34,  7.71s/it] 87%|██████████████████▎  | 872/1000 [4:49:32<16:26,  7.71s/it]                                                               {'loss': 2.1042, 'grad_norm': 0.981103777885437, 'learning_rate': 8.60159679918372e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 692.84, 'epoch': 0.87}
- 87%|██████████████████▎  | 872/1000 [4:49:32<16:26,  7.71s/it] 87%|██████████████████▎  | 873/1000 [4:49:39<16:19,  7.71s/it]                                                               {'loss': 2.5083, 'grad_norm': 0.9719055891036987, 'learning_rate': 8.470663844522052e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 780.98, 'epoch': 0.87}
- 87%|██████████████████▎  | 873/1000 [4:49:39<16:19,  7.71s/it] 87%|██████████████████▎  | 874/1000 [4:49:47<16:10,  7.70s/it]                                                               {'loss': 2.2357, 'grad_norm': 0.7637308239936829, 'learning_rate': 8.340690989285726e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1054.7, 'epoch': 0.87}
- 87%|█████��████████████▎  | 874/1000 [4:49:47<16:10,  7.70s/it] 88%|██████████████████▍  | 875/1000 [4:49:55<16:02,  7.70s/it]                                                               {'loss': 2.4938, 'grad_norm': 0.8872864246368408, 'learning_rate': 8.21167959682848e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 937.2, 'epoch': 0.88}
- 88%|██████████████████▍  | 875/1000 [4:49:55<16:02,  7.70s/it] 88%|██████████████████▍  | 876/1000 [4:50:02<15:54,  7.69s/it]                                                               {'loss': 2.1207, 'grad_norm': 0.8651183843612671, 'learning_rate': 8.083631020418791e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 847.4, 'epoch': 0.88}
- 88%|██████████████████▍  | 876/1000 [4:50:02<15:54,  7.69s/it] 88%|██████████████████▍  | 877/1000 [4:50:10<15:46,  7.70s/it]                                                               {'loss': 2.5265, 'grad_norm': 0.8207940459251404, 'learning_rate': 7.956546603225601e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1030.08, 'epoch': 0.88}
- 88%|██████████████████▍  | 877/1000 [4:50:10<15:46,  7.70s/it] 88%|██████████████████▍  | 878/1000 [4:50:18<15:39,  7.70s/it]                                                               {'loss': 2.2324, 'grad_norm': 0.8319547772407532, 'learning_rate': 7.830427678304353e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1177.37, 'epoch': 0.88}
- 88%|██████████████████▍  | 878/1000 [4:50:18<15:39,  7.70s/it] 88%|██████████████████▍  | 879/1000 [4:50:25<15:32,  7.70s/it]                                                               {'loss': 2.2587, 'grad_norm': 0.8311581015586853, 'learning_rate': 7.705275568582848e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 943.96, 'epoch': 0.88}
- 88%|██████████████████▍  | 879/1000 [4:50:25<15:32,  7.70s/it][2025-10-18 23:53:31,831] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42528] Loading dataset: AiAF/conversations with base_type: chat_template and prompt_style: None
-[2025-10-18 23:53:31,832] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:42528] Using chat template:
----
-{{ bos_token }}
-{% for m in messages %}
-  {% set role = 'model' if m['role']=='assistant' else 'user' %}
-  {{ '<start_of_turn>' + role + '\n' + m['content'] | trim + '<end_of_turn>\n' }}
-{% endfor %}
-{% if add_generation_prompt %}
-{{ '<start_of_turn>model\n' }}
-{% endif %}
-
----
-
-Tokenizing Prompts (num_proc=12):   0%| | 0/10000 [00:00<?, ? e[A
-Tokenizing Prompts (num_proc=12):   8%| | 833/10000 [24:46<4:32[A
-Tokenizing Prompts (num_proc=12):  17%|▏| 1667/10000 [25:13<1:4[A
-Tokenizing Prompts (num_proc=12):  25%|▎| 2500/10000 [25:34<52:[A
-Tokenizing Prompts (num_proc=12):  33%|▎| 3333/10000 [26:30<31:[A
-Tokenizing Prompts (num_proc=12):  42%|▍| 4166/10000 [26:54<18:[A
-Tokenizing Prompts (num_proc=12):  50%|▍| 4999/10000 [27:54<12:[A
-Tokenizing Prompts (num_proc=12):  58%|▌| 5833/10000 [28:04<07:[A
-Tokenizing Prompts (num_proc=12):  67%|▋| 6667/10000 [29:27<05:[A
-Tokenizing Prompts (num_proc=12):  75%|▊| 7500/10000 [29:28<02:[A
-Tokenizing Prompts (num_proc=12):  83%|▊| 8333/10000 [32:03<02:[A
-Tokenizing Prompts (num_proc=12):  92%|▉| 9166/10000 [40:08<03:[A
-Tokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [59:41<00[ATokenizing Prompts (num_proc=12): 100%|█| 10000/10000 [59:43<00
-
-Dropping Long Sequences:   0%| | 0/10000 [00:00<?, ? examples/s[A
-Dropping Long Sequences:  10%| | 1000/10000 [00:10<01:35, 94.38[A
-Dropping Long Sequences:  20%|▏| 2000/10000 [00:18<01:11, 112.0[A
-Dropping Long Sequences:  30%|▎| 3000/10000 [00:26<00:59, 117.9[A
-Dropping Long Sequences:  40%|▍| 4000/10000 [00:33<00:48, 124.0[A
-Dropping Long Sequences:  50%|▌| 5000/10000 [00:42<00:41, 120.4[A
-Dropping Long Sequences:  60%|▌| 6000/10000 [00:50<00:33, 119.6[A
-Dropping Long Sequences:  70%|▋| 7000/10000 [00:58<00:24, 121.2[A
-Dropping Long Sequences:  80%|▊| 8000/10000 [01:07<00:16, 121.6[A
-Dropping Long Sequences:  90%|▉| 9000/10000 [01:15<00:08, 122.2[A
-Dropping Long Sequences: 100%|█| 10000/10000 [01:23<00:00, 122.[ADropping Long Sequences: 100%|█| 10000/10000 [01:23<00:00, 119.
-
-Add position_id column (Pretraining Sample Packing):   0%| | 0/[A
-Add position_id column (Pretraining Sample Packing):  53%|▌| 10[A
-Add position_id column (Pretraining Sample Packing): 100%|█| 19[AAdd position_id column (Pretraining Sample Packing): 100%|█| 19
-[2025-10-19 00:54:45,862] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42528] Using single process for pack_parallel, running sequentially.
- 88%|██████████████  | 880/1000 [5:52:14<37:16:08, 1118.07s/it]                                                               {'loss': 2.1909, 'grad_norm': 0.8694174289703369, 'learning_rate': 7.581091586847522e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1046.12, 'epoch': 0.88}
- 88%|██████████████  | 880/1000 [5:52:14<37:16:08, 1118.07s/it] 88%|██████████████▉  | 881/1000 [5:52:22<25:56:40, 784.88s/it]                                                               {'loss': 2.2759, 'grad_norm': 0.6799465417861938, 'learning_rate': 7.457877035729588e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1298.07, 'epoch': 0.88}
- 88%|██████████████▉  | 881/1000 [5:52:22<25:56:40, 784.88s/it] 88%|██████████████▉  | 882/1000 [5:52:29<18:04:54, 551.65s/it]                                                               {'loss': 2.3176, 'grad_norm': 0.7321369051933289, 'learning_rate': 7.335633207691361e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1124.03, 'epoch': 0.88}
- 88%|██████████████▉  | 882/1000 [5:52:29<18:04:54, 551.65s/it] 88%|███████████████  | 883/1000 [5:52:37<12:37:22, 388.40s/it]                                                               {'loss': 2.4338, 'grad_norm': 0.8011580109596252, 'learning_rate': 7.21436138501278e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1098.04, 'epoch': 0.88}
- 88%|███████████████  | 883/1000 [5:52:37<12:37:22, 388.40s/it] 88%|███████████████▉  | 884/1000 [5:52:44<8:49:58, 274.13s/it]                                                               {'loss': 2.2718, 'grad_norm': 0.8456489443778992, 'learning_rate': 7.094062839777837e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 900.37, 'epoch': 0.88}
- 88%|███████████████▉  | 884/1000 [5:52:44<8:49:58, 274.13s/it] 88%|███████████████▉  | 885/1000 [5:52:52<6:12:07, 194.15s/it]                                                               {'loss': 2.5104, 'grad_norm': 0.7948476076126099, 'learning_rate': 6.974738833861383e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1036.61, 'epoch': 0.89}
- 88%|███████████████▉  | 885/1000 [5:52:52<6:12:07, 194.15s/it] 89%|███████████████▉  | 886/1000 [5:52:59<4:22:30, 138.16s/it]                                                               {'loss': 2.0426, 'grad_norm': 0.8351035714149475, 'learning_rate': 6.856390618915775e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 876.02, 'epoch': 0.89}
- 89%|███████████████▉  | 886/1000 [5:52:59<4:22:30, 138.16s/it] 89%|████████████████▊  | 887/1000 [5:53:07<3:06:23, 98.97s/it]                                                               {'loss': 2.3276, 'grad_norm': 0.8119566440582275, 'learning_rate': 6.739019436357774e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 923.08, 'epoch': 0.89}
- 89%|████████████████▊  | 887/1000 [5:53:07<3:06:23, 98.97s/it] 89%|████████████████▊  | 888/1000 [5:53:14<2:13:32, 71.54s/it]                                                               {'loss': 2.2628, 'grad_norm': 0.7975265979766846, 'learning_rate': 6.622626517355557e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1060.33, 'epoch': 0.89}
- 89%|████████████████▊  | 888/1000 [5:53:14<2:13:32, 71.54s/it] 89%|████████████████▉  | 889/1000 [5:53:22<1:36:51, 52.35s/it]                                                               {'loss': 2.2452, 'grad_norm': 0.877912700176239, 'learning_rate': 6.507213082815744e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 918.74, 'epoch': 0.89}
- 89%|████████████████▉  | 889/1000 [5:53:22<1:36:51, 52.35s/it] 89%|████████████████▉  | 890/1000 [5:53:29<1:11:21, 38.93s/it]                                                               {'loss': 2.1007, 'grad_norm': 0.863393247127533, 'learning_rate': 6.392780343370686e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 802.44, 'epoch': 0.89}
- 89%|████████████████▉  | 890/1000 [5:53:29<1:11:21, 38.93s/it] 89%|██████████████████▋  | 891/1000 [5:53:37<53:38, 29.52s/it]                                                               {'loss': 2.3517, 'grad_norm': 0.7911595702171326, 'learning_rate': 6.2793294993656494e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1019.42, 'epoch': 0.89}
- 89%|██████████████████▋  | 891/1000 [5:53:37<53:38, 29.52s/it] 89%|██████████████████▋  | 892/1000 [5:53:45<41:18, 22.95s/it]                                                               {'loss': 2.2538, 'grad_norm': 0.8061058521270752, 'learning_rate': 6.166861740846297e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1054.17, 'epoch': 0.89}
- 89%|██████████████████▋  | 892/1000 [5:53:45<41:18, 22.95s/it] 89%|██████████████████▊  | 893/1000 [5:53:52<32:44, 18.36s/it]                                                               {'loss': 2.3443, 'grad_norm': 0.8848993182182312, 'learning_rate': 6.055378247546218e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 851.26, 'epoch': 0.89}
- 89%|██████████████████▊  | 893/1000 [5:53:52<32:44, 18.36s/it] 89%|██████████████████▊  | 894/1000 [5:54:00<26:45, 15.15s/it]                                                               {'loss': 2.3407, 'grad_norm': 0.8788235783576965, 'learning_rate': 5.9448801888744795e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 903.59, 'epoch': 0.89}
- 89%|██████████████████▊  | 894/1000 [5:54:00<26:45, 15.15s/it] 90%|██████████████████▊  | 895/1000 [5:54:08<22:35, 12.91s/it]                                                               {'loss': 2.3428, 'grad_norm': 1.0063735246658325, 'learning_rate': 5.835368723903456e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 696.4, 'epoch': 0.9}
- 90%|██████████████████▊  | 895/1000 [5:54:08<22:35, 12.91s/it] 90%|██████████████████▊  | 896/1000 [5:54:15<19:38, 11.33s/it]                                                               {'loss': 2.4691, 'grad_norm': 0.77157062292099, 'learning_rate': 5.726845001356573e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1103.4, 'epoch': 0.9}
- 90%|██████████████████▊  | 896/1000 [5:54:15<19:38, 11.33s/it] 90%|██████████████████▊  | 897/1000 [5:54:23<17:34, 10.24s/it]                                                               {'loss': 2.2848, 'grad_norm': 0.7940182685852051, 'learning_rate': 5.6193101595963585e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1266.59, 'epoch': 0.9}
- 90%|██████████████████▊  | 897/1000 [5:54:23<17:34, 10.24s/it] 90%|██████████████████▊  | 898/1000 [5:54:31<16:06,  9.47s/it]                                                               {'loss': 2.3828, 'grad_norm': 0.8417141437530518, 'learning_rate': 5.512765326612379e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 966.81, 'epoch': 0.9}
- 90%|██████████████████▊  | 898/1000 [5:54:31<16:06,  9.47s/it] 90%|█��████████████████▉  | 899/1000 [5:54:38<15:02,  8.94s/it]                                                               {'loss': 2.385, 'grad_norm': 0.7657260298728943, 'learning_rate': 5.407211620009544e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1132.73, 'epoch': 0.9}
- 90%|██████████████████▉  | 899/1000 [5:54:38<15:02,  8.94s/it] 90%|██████████████████▉  | 900/1000 [5:54:46<14:17,  8.57s/it]                                                               {'loss': 2.404, 'grad_norm': 0.777381956577301, 'learning_rate': 5.30265014699628e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1103.1, 'epoch': 0.9}
- 90%|██████████████████▉  | 900/1000 [5:54:46<14:17,  8.57s/it][2025-10-19 00:57:33,233] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-19 00:57:36,248] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4523584842681885
-[2025-10-19 00:57:37,627] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.378978967666626
-[2025-10-19 00:57:39,103] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4754667282104492
-[2025-10-19 00:57:40,530] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4263489246368408
-[2025-10-19 00:57:40,530] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.21it/s][A
-  2%|▍                         | 3/179 [00:00<00:42,  4.17it/s][A
-  2%|▌                         | 4/179 [00:00<00:47,  3.72it/s][A
-  3%|▋                         | 5/179 [00:01<01:19,  2.20it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.67it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.79it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.42it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.66it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.78it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.87it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.47it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.79it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.88it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.47it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.69it/s][A
- 11%|██▋                      | 19/179 [00:06<00:56,  2.81it/s][A
- 11%|██▊                      | 20/179 [00:07<00:54,  2.89it/s][A
- 12%|██▉                      | 21/179 [00:07<01:03,  2.48it/s][A
- 12%|███                      | 22/179 [00:07<00:57,  2.71it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.81it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.88it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.70it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.80it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.46it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.68it/s][A
- 17%|████▎                    | 31/179 [00:11<00:53,  2.79it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.85it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.46it/s][A
- 19%|████▋                    | 34/179 [00:12<00:54,  2.68it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.78it/s][A
- 20%|█████                    | 36/179 [00:13<00:50,  2.84it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.46it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.67it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:50,  2.78it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.86it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:56,  2.46it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:51,  2.67it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.79it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:47,  2.86it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:50,  2.66it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.77it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.87it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:53,  2.44it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.78it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.45it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.68it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.78it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.86it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.45it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.81it/s][A
- 34%|████████▍                | 60/179 [00:22<00:41,  2.86it/s][A
- 34%|████████▌                | 61/179 [00:22<00:47,  2.47it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:40,  2.86it/s][A
- 36%|█████████                | 65/179 [00:24<00:46,  2.47it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.69it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.79it/s][A
- 38%|█████████▍               | 68/179 [00:25<00:39,  2.83it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.45it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.67it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|██████████▏              | 73/179 [00:27<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:39,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.86it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.46it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.47it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:36,  2.69it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.79it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.87it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.46it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.70it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:32,  2.80it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.89it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.49it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:32,  2.70it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.80it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.88it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.48it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.68it/s][A
- 53%|█████████████▎           | 95/179 [00:35<00:30,  2.79it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.88it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.70it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.80it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.87it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.70it/s][A
- 58%|█████████████▊          | 103/179 [00:38<00:27,  2.80it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.87it/s][A
- 59%|██████████████          | 105/179 [00:38<00:29,  2.47it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:26,  2.71it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.80it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.87it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.48it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.70it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.80it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.87it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.48it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.68it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.79it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:22,  2.86it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.68it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.79it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.86it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.70it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:19,  2.81it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.46it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.80it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.87it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.69it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.80it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.45it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.67it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.77it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.84it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.47it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.68it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.78it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.85it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.47it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.68it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.77it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.45it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.67it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.70it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:10,  2.80it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.85it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.45it/s][A
- 86%|████████████████████▋   | 154/179 [00:57<00:09,  2.70it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.81it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:07,  2.88it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.71it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.81it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.88it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.88it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.48it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.69it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.80it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.88it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.69it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.79it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.86it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.45it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.67it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.78it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.86it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.46it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.69it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.52it/s][A                                                               
-                                                               [A{'eval_loss': 2.2489285469055176, 'eval_runtime': 68.5668, 'eval_samples_per_second': 2.859, 'eval_steps_per_second': 1.429, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.9}
- 90%|██████████████████▉  | 900/1000 [5:56:02<14:17,  8.57s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.52it/s][A
-                                                               [A[2025-10-19 00:58:49,105] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-900
- 90%|██████████████████▉  | 901/1000 [5:56:12<52:29, 31.81s/it]                                                               {'loss': 2.4762, 'grad_norm': 0.8949609398841858, 'learning_rate': 5.199082004372957e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 871.1, 'epoch': 0.9}
- 90%|██████████████████▉  | 901/1000 [5:56:12<52:29, 31.81s/it] 90%|██████████████████▉  | 902/1000 [5:56:20<40:08, 24.58s/it]                                                               {'loss': 2.4545, 'grad_norm': 0.9127408266067505, 'learning_rate': 5.096508278520384e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 867.49, 'epoch': 0.9}
- 90%|██████████████████▉  | 902/1000 [5:56:20<40:08, 24.58s/it] 90%|██████████████████▉  | 903/1000 [5:56:28<31:32, 19.51s/it]                                                               {'loss': 2.2399, 'grad_norm': 0.7866583466529846, 'learning_rate': 4.994930045388413e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1036.86, 'epoch': 0.9}
- 90%|██████████████████▉  | 903/1000 [5:56:28<31:32, 19.51s/it] 90%|██████████████████▉  | 904/1000 [5:56:35<25:32, 15.96s/it]                                                               {'loss': 2.3419, 'grad_norm': 0.8697656393051147, 'learning_rate': 4.8943483704846475e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 863.85, 'epoch': 0.9}
- 90%|██████████████████▉  | 904/1000 [5:56:35<25:32, 15.96s/it] 90%|███████████████████  | 905/1000 [5:56:43<21:20, 13.48s/it]                                                               {'loss': 2.4656, 'grad_norm': 0.7631382346153259, 'learning_rate': 4.794764308863242e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1133.99, 'epoch': 0.91}
- 90%|███████████████████  | 905/1000 [5:56:43<21:20, 13.48s/it] 91%|███████████████████  | 906/1000 [5:56:51<18:24, 11.75s/it]                                                               {'loss': 2.2669, 'grad_norm': 0.8156150579452515, 'learning_rate': 4.6961789051139124e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 973.46, 'epoch': 0.91}
- 91%|███████████████████  | 906/1000 [5:56:51<18:24, 11.75s/it] 91%|███████████████████  | 907/1000 [5:56:58<16:19, 10.53s/it]                                                               {'loss': 2.4605, 'grad_norm': 0.8866649866104126, 'learning_rate': 4.5985931933508754e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 940.69, 'epoch': 0.91}
- 91%|███████████████████  | 907/1000 [5:56:58<16:19, 10.53s/it] 91%|███████████████████  | 908/1000 [5:57:06<14:50,  9.68s/it]                                                               {'loss': 2.2944, 'grad_norm': 0.8710980415344238, 'learning_rate': 4.502008197202068e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 998.46, 'epoch': 0.91}
- 91%|███████████████████  | 908/1000 [5:57:06<14:50,  9.68s/it] 91%|███████████████████  | 909/1000 [5:57:14<13:46,  9.09s/it]                                                               {'loss': 2.2844, 'grad_norm': 0.890874981880188, 'learning_rate': 4.406424929798403e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 836.5, 'epoch': 0.91}
- 91%|███████████████████  | 909/1000 [5:57:14<13:46,  9.09s/it] 91%|███████████████████  | 910/1000 [5:57:21<13:00,  8.67s/it]                                                               {'loss': 2.2169, 'grad_norm': 0.9189761877059937, 'learning_rate': 4.311844393763109e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 682.22, 'epoch': 0.91}
- 91%|███████████████████  | 910/1000 [5:57:21<13:00,  8.67s/it] 91%|███████████████████▏ | 911/1000 [5:57:29<12:25,  8.38s/it]                                                               {'loss': 2.312, 'grad_norm': 0.7664890289306641, 'learning_rate': 4.2182675812012965e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1176.88, 'epoch': 0.91}
- 91%|███████████████████▏ | 911/1000 [5:57:29<12:25,  8.38s/it] 91%|███████████████████▏ | 912/1000 [5:57:37<11:59,  8.18s/it]                                                               {'loss': 2.3113, 'grad_norm': 0.8774908781051636, 'learning_rate': 4.125695473689406e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1124.24, 'epoch': 0.91}
- 91%|███████████████████▏ | 912/1000 [5:57:37<11:59,  8.18s/it] 91%|███████████████████▏ | 913/1000 [5:57:45<11:39,  8.04s/it]                                                               {'loss': 2.2331, 'grad_norm': 0.7188146710395813, 'learning_rate': 4.034129042265066e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1169.77, 'epoch': 0.91}
- 91%|███████████████████▏ | 913/1000 [5:57:45<11:39,  8.04s/it] 91%|███████████████████▏ | 914/1000 [5:57:52<11:22,  7.93s/it]                                                               {'loss': 2.2158, 'grad_norm': 0.7439635992050171, 'learning_rate': 3.943569247416801e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1069.21, 'epoch': 0.91}
- 91%|███████████████████▏ | 914/1000 [5:57:52<11:22,  7.93s/it] 92%|███████████████████▏ | 915/1000 [5:58:00<11:08,  7.86s/it]                                                               {'loss': 2.0706, 'grad_norm': 0.8696659803390503, 'learning_rate': 3.854017039074009e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 839.46, 'epoch': 0.92}
- 92%|███████████████████▏ | 915/1000 [5:58:00<11:08,  7.86s/it] 92%|███████████████████▏ | 916/1000 [5:58:08<10:56,  7.81s/it]                                                               {'loss': 2.1532, 'grad_norm': 0.8310878872871399, 'learning_rate': 3.7654733565969826e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 962.6, 'epoch': 0.92}
- 92%|███████████████████▏ | 916/1000 [5:58:08<10:56,  7.81s/it] 92%|███████████████████▎ | 917/1000 [5:58:15<10:46,  7.79s/it]                                                               {'loss': 2.3169, 'grad_norm': 0.7939152121543884, 'learning_rate': 3.6779391287670494e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1062.65, 'epoch': 0.92}
- 92%|███████████████████▎ | 917/1000 [5:58:15<10:46,  7.79s/it] 92%|███████████████████▎ | 918/1000 [5:58:23<10:37,  7.77s/it]                                                               {'loss': 2.3924, 'grad_norm': 0.7757739424705505, 'learning_rate': 3.591415273776855e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1109.86, 'epoch': 0.92}
- 92%|███████████████████▎ | 918/1000 [5:58:23<10:37,  7.77s/it] 92%|███████████████████▎ | 919/1000 [5:58:31<10:28,  7.76s/it]                                                               {'loss': 2.28, 'grad_norm': 0.8107742667198181, 'learning_rate': 3.5059026992206647e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 966.1, 'epoch': 0.92}
- 92%|███████████████████▎ | 919/1000 [5:58:31<10:28,  7.76s/it] 92%|███████████████████▎ | 920/1000 [5:58:39<10:19,  7.75s/it]                                                               {'loss': 2.1964, 'grad_norm': 0.7709587812423706, 'learning_rate': 3.421402302084953e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1118.99, 'epoch': 0.92}
- 92%|███████████████████▎ | 920/1000 [5:58:39<10:19,  7.75s/it] 92%|███████████████████▎ | 921/1000 [5:58:46<10:10,  7.73s/it]                                                               {'loss': 2.3611, 'grad_norm': 1.0184322595596313, 'learning_rate': 3.3379149687388867e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 714.28, 'epoch': 0.92}
- 92%|███████████████████▎ | 921/1000 [5:58:46<10:10,  7.73s/it] 92%|███████████████████▎ | 922/1000 [5:58:54<10:02,  7.73s/it]                                                               {'loss': 2.2054, 'grad_norm': 0.7510554790496826, 'learning_rate': 3.2554415749250888e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1274.51, 'epoch': 0.92}
- 92%|███████████████████▎ | 922/1000 [5:58:54<10:02,  7.73s/it] 92%|███████████████████▍ | 923/1000 [5:59:02<09:54,  7.73s/it]                                                               {'loss': 2.343, 'grad_norm': 0.7551178932189941, 'learning_rate': 3.1739829857504234e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1083.24, 'epoch': 0.92}
- 92%|███████████████████▍ | 923/1000 [5:59:02<09:54,  7.73s/it] 92%|███████████████████▍ | 924/1000 [5:59:09<09:46,  7.72s/it]                                                               {'loss': 2.4333, 'grad_norm': 0.8899993300437927, 'learning_rate': 3.093540055676958e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1079.69, 'epoch': 0.92}
- 92%|███████████████████▍ | 924/1000 [5:59:09<09:46,  7.72s/it] 92%|███████████████████▍ | 925/1000 [5:59:17<09:38,  7.71s/it]                                                               {'loss': 2.3467, 'grad_norm': 1.0318634510040283, 'learning_rate': 3.014113628512982e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 697.9, 'epoch': 0.93}
- 92%|███████████████████▍ | 925/1000 [5:59:17<09:38,  7.71s/it] 93%|███████████████████▍ | 926/1000 [5:59:25<09:30,  7.71s/it]                                                               {'loss': 2.5106, 'grad_norm': 0.8982748985290527, 'learning_rate': 2.9357045374040825e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 903.38, 'epoch': 0.93}
- 93%|███████████████████▍ | 926/1000 [5:59:25<09:30,  7.71s/it] 93%|███████████████████▍ | 927/1000 [5:59:32<09:22,  7.71s/it]                                                               {'loss': 2.4083, 'grad_norm': 0.8162830471992493, 'learning_rate': 2.8583136048245697e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1138.53, 'epoch': 0.93}
- 93%|███████████████████▍ | 927/1000 [5:59:32<09:22,  7.71s/it] 93%|███████████████████▍ | 928/1000 [5:59:40<09:15,  7.72s/it]                                                               {'loss': 2.276, 'grad_norm': 0.8001974821090698, 'learning_rate': 2.781941642568686e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1132.56, 'epoch': 0.93}
- 93%|███████████████████▍ | 928/1000 [5:59:40<09:15,  7.72s/it] 93%|███████████████████▌ | 929/1000 [5:59:48<09:07,  7.72s/it]                                                               {'loss': 2.2492, 'grad_norm': 0.7837321758270264, 'learning_rate': 2.706589451742181e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1038.68, 'epoch': 0.93}
- 93%|███████████████████▌ | 929/1000 [5:59:48<09:07,  7.72s/it] 93%|███████████████████▌ | 930/1000 [5:59:56<09:00,  7.72s/it]                                                               {'loss': 2.3437, 'grad_norm': 0.7401877641677856, 'learning_rate': 2.632257822753881e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1228.53, 'epoch': 0.93}
- 93%|███████████████████▌ | 930/1000 [5:59:56<09:00,  7.72s/it] 93%|███████████████████▌ | 931/1000 [6:00:03<08:52,  7.71s/it]                                                               {'loss': 2.4389, 'grad_norm': 0.8554418087005615, 'learning_rate': 2.5589475353073988e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 885.87, 'epoch': 0.93}
- 93%|███████████████████▌ | 931/1000 [6:00:03<08:52,  7.71s/it] 93%|███████████████���███▌ | 932/1000 [6:00:11<08:43,  7.70s/it]                                                               {'loss': 2.3406, 'grad_norm': 0.9192164540290833, 'learning_rate': 2.486659358392951e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 763.32, 'epoch': 0.93}
- 93%|███████████████████▌ | 932/1000 [6:00:11<08:43,  7.70s/it] 93%|███████████████████▌ | 933/1000 [6:00:19<08:35,  7.70s/it]                                                               {'loss': 2.3083, 'grad_norm': 0.942592978477478, 'learning_rate': 2.415394050279318e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 814.78, 'epoch': 0.93}
- 93%|███████████████████▌ | 933/1000 [6:00:19<08:35,  7.70s/it] 93%|███████████████████▌ | 934/1000 [6:00:26<08:28,  7.70s/it]                                                               {'loss': 2.2795, 'grad_norm': 0.8741790652275085, 'learning_rate': 2.3451523585058754e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 769.67, 'epoch': 0.93}
- 93%|███████████████████▌ | 934/1000 [6:00:26<08:28,  7.70s/it] 94%|███████████████████▋ | 935/1000 [6:00:34<08:20,  7.70s/it]                                                               {'loss': 2.2822, 'grad_norm': 0.8478880524635315, 'learning_rate': 2.2759350198746976e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 941.18, 'epoch': 0.94}
- 94%|███████████████████▋ | 935/1000 [6:00:34<08:20,  7.70s/it] 94%|███████████████████▋ | 936/1000 [6:00:42<08:12,  7.70s/it]                                                               {'loss': 2.2101, 'grad_norm': 0.812393307685852, 'learning_rate': 2.2077427604429433e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 973.19, 'epoch': 0.94}
- 94%|███████████████████▋ | 936/1000 [6:00:42<08:12,  7.70s/it] 94%|███████████████████▋ | 937/1000 [6:00:50<08:05,  7.70s/it]                                                               {'loss': 2.2674, 'grad_norm': 0.7618114948272705, 'learning_rate': 2.1405762955151176e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1181.79, 'epoch': 0.94}
- 94%|███████████████████▋ | 937/1000 [6:00:50<08:05,  7.70s/it] 94%|███████████████████▋ | 938/1000 [6:00:57<07:57,  7.70s/it]                                                               {'loss': 2.2749, 'grad_norm': 0.8220508694648743, 'learning_rate': 2.074436329635687e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1221.55, 'epoch': 0.94}
- 94%|███████████████████▋ | 938/1000 [6:00:57<07:57,  7.70s/it] 94%|███████████████████▋ | 939/1000 [6:01:05<07:49,  7.70s/it]                                                               {'loss': 2.4388, 'grad_norm': 1.0159056186676025, 'learning_rate': 2.009323556581566e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 687.16, 'epoch': 0.94}
- 94%|███████████████████▋ | 939/1000 [6:01:05<07:49,  7.70s/it] 94%|███████████████████▋ | 940/1000 [6:01:13<07:42,  7.71s/it]                                                               {'loss': 2.2667, 'grad_norm': 0.7561337351799011, 'learning_rate': 1.945238659354953e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1281.35, 'epoch': 0.94}
- 94%|███████████████████▋ | 940/1000 [6:01:13<07:42,  7.71s/it] 94%|███████████████████▊ | 941/1000 [6:01:20<07:34,  7.70s/it]                                                               {'loss': 2.2573, 'grad_norm': 0.8838269114494324, 'learning_rate': 1.882182310176095e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 806.7, 'epoch': 0.94}
- 94%|███████████████████▊ | 941/1000 [6:01:20<07:34,  7.70s/it] 94%|███████████████████▊ | 942/1000 [6:01:28<07:26,  7.70s/it]                                                               {'loss': 2.2695, 'grad_norm': 0.8877288699150085, 'learning_rate': 1.8201551704762453e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 806.43, 'epoch': 0.94}
- 94%|███████████████████▊ | 942/1000 [6:01:28<07:26,  7.70s/it] 94%|███████████████████▊ | 943/1000 [6:01:36<07:18,  7.70s/it]                                                               {'loss': 2.2074, 'grad_norm': 0.8402616381645203, 'learning_rate': 1.7591578908907724e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 915.82, 'epoch': 0.94}
- 94%|███████████████████▊ | 943/1000 [6:01:36<07:18,  7.70s/it] 94%|███████████████████▊ | 944/1000 [6:01:43<07:10,  7.69s/it]                                                               {'loss': 2.4728, 'grad_norm': 1.0581202507019043, 'learning_rate': 1.6991911112522407e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 724.71, 'epoch': 0.94}
- 94%|███████████████████▊ | 944/1000 [6:01:43<07:10,  7.69s/it] 94%|███████████████████▊ | 945/1000 [6:01:51<07:03,  7.70s/it]                                                               {'loss': 2.2689, 'grad_norm': 0.9054380059242249, 'learning_rate': 1.6402554605838172e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 827.4, 'epoch': 0.94}
- 94%|███████████████████▊ | 945/1000 [6:01:51<07:03,  7.70s/it] 95%|███████████████████▊ | 946/1000 [6:01:59<06:55,  7.70s/it]                                                               {'loss': 2.1174, 'grad_norm': 0.8485203385353088, 'learning_rate': 1.5823515570925763e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 803.63, 'epoch': 0.95}
- 95%|███████████████████▊ | 946/1000 [6:01:59<06:55,  7.70s/it] 95%|███████████████████▉ | 947/1000 [6:02:07<06:48,  7.70s/it]                                                               {'loss': 2.0874, 'grad_norm': 1.024441123008728, 'learning_rate': 1.5254800081630826e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 604.82, 'epoch': 0.95}
- 95%|███████████████████▉ | 947/1000 [6:02:07<06:48,  7.70s/it] 95%|███████████████████▉ | 948/1000 [6:02:14<06:40,  7.70s/it]                                                               {'loss': 2.2538, 'grad_norm': 0.86529141664505, 'learning_rate': 1.4696414103509636e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 814.58, 'epoch': 0.95}
- 95%|███████████████████▉ | 948/1000 [6:02:14<06:40,  7.70s/it] 95%|███████████████████▉ | 949/1000 [6:02:22<06:33,  7.71s/it]                                                               {'loss': 2.1989, 'grad_norm': 0.8138614892959595, 'learning_rate': 1.4148363493766802e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1085.71, 'epoch': 0.95}
- 95%|███████████████████▉ | 949/1000 [6:02:22<06:33,  7.71s/it] 95%|███████████████████▉ | 950/1000 [6:02:30<06:25,  7.71s/it]                                                               {'loss': 2.4012, 'grad_norm': 0.9980276823043823, 'learning_rate': 1.361065400119399e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 787.51, 'epoch': 0.95}
- 95%|███████████████████▉ | 950/1000 [6:02:30<06:25,  7.71s/it][2025-10-19 01:05:16,775] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-19 01:05:19,765] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4397432804107666
-[2025-10-19 01:05:21,175] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4104745388031006
-[2025-10-19 01:05:22,610] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4338619709014893
-[2025-10-19 01:05:24,000] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3894948959350586
-[2025-10-19 01:05:24,000] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.20it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.35it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.78it/s][A
-  3%|▋                         | 5/179 [00:01<01:18,  2.21it/s][A
-  3%|▊                         | 6/179 [00:02<01:09,  2.50it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.66it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.78it/s][A
-  5%|█▎                        | 9/179 [00:03<01:10,  2.42it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.66it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.79it/s][A
-  7%|█▋                       | 12/179 [00:04<00:58,  2.86it/s][A
-  7%|█▊                       | 13/179 [00:04<01:07,  2.45it/s][A
-  8%|█▉                       | 14/179 [00:05<01:01,  2.68it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.78it/s][A
-  9%|██▏                      | 16/179 [00:05<00:57,  2.86it/s][A
-  9%|██▎                      | 17/179 [00:06<01:05,  2.46it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.68it/s][A
- 11%|██▋                      | 19/179 [00:06<00:57,  2.79it/s][A
- 11%|██▊                      | 20/179 [00:07<00:55,  2.88it/s][A
- 12%|██▉                      | 21/179 [00:07<01:04,  2.46it/s][A
- 12%|███                      | 22/179 [00:07<00:58,  2.69it/s][A
- 13%|███▏                     | 23/179 [00:08<00:55,  2.80it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.89it/s][A
- 14%|███▍                     | 25/179 [00:09<01:02,  2.47it/s][A
- 15%|███▋                     | 26/179 [00:09<00:56,  2.69it/s][A
- 15%|███▊                     | 27/179 [00:09<00:54,  2.79it/s][A
- 16%|███▉                     | 28/179 [00:10<00:52,  2.87it/s][A
- 16%|████                     | 29/179 [00:10<01:00,  2.47it/s][A
- 17%|████▏                    | 30/179 [00:10<00:55,  2.69it/s][A
- 17%|████▎                    | 31/179 [00:11<00:52,  2.80it/s][A
- 18%|████▍                    | 32/179 [00:11<00:51,  2.88it/s][A
- 18%|████▌                    | 33/179 [00:12<00:59,  2.47it/s][A
- 19%|████▋                    | 34/179 [00:12<00:53,  2.70it/s][A
- 20%|████▉                    | 35/179 [00:12<00:51,  2.80it/s][A
- 20%|█████                    | 36/179 [00:13<00:49,  2.88it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:57,  2.46it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:52,  2.71it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.81it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:48,  2.88it/s][A
- 23%|█████▋                   | 41/179 [00:15<00:55,  2.48it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:50,  2.71it/s][A
- 24%|██████                   | 43/179 [00:15<00:48,  2.82it/s][A
- 25%|██████▏                  | 44/179 [00:16<00:46,  2.88it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:54,  2.47it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:49,  2.70it/s][A
- 26%|██████▌                  | 47/179 [00:17<00:47,  2.80it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:45,  2.88it/s][A
- 27%|██████▊                  | 49/179 [00:18<00:52,  2.47it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:48,  2.68it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.79it/s][A
- 29%|███████▎                 | 52/179 [00:19<00:44,  2.87it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:51,  2.47it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:46,  2.69it/s][A
- 31%|███████▋                 | 55/179 [00:20<00:44,  2.79it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.88it/s][A
- 32%|███████▉                 | 57/179 [00:21<00:49,  2.47it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.70it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.80it/s][A
- 34%|████████▍                | 60/179 [00:21<00:41,  2.87it/s][A
- 34%|████████▌                | 61/179 [00:22<00:48,  2.44it/s][A
- 35%|████████▋                | 62/179 [00:22<00:43,  2.68it/s][A
- 35%|████████▊                | 63/179 [00:23<00:41,  2.79it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.88it/s][A
- 36%|█████████                | 65/179 [00:23<00:46,  2.48it/s][A
- 37%|█████████▏               | 66/179 [00:24<00:41,  2.70it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:40,  2.80it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:38,  2.88it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:44,  2.46it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:40,  2.70it/s][A
- 40%|█████████▉               | 71/179 [00:26<00:38,  2.80it/s][A
- 40%|██████████               | 72/179 [00:26<00:37,  2.87it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.47it/s][A
- 41%|██████████▎              | 74/179 [00:27<00:38,  2.69it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:37,  2.80it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.88it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:41,  2.47it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:37,  2.69it/s][A
- 44%|███████████              | 79/179 [00:29<00:35,  2.79it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:34,  2.87it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.46it/s][A
- 46%|███████████▍             | 82/179 [00:30<00:36,  2.68it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.78it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:33,  2.85it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:38,  2.45it/s][A
- 48%|████████████             | 86/179 [00:31<00:34,  2.69it/s][A
- 49%|████████████▏            | 87/179 [00:32<00:33,  2.78it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:31,  2.86it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:36,  2.47it/s][A
- 50%|████████████▌            | 90/179 [00:33<00:32,  2.70it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.80it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.88it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.48it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.69it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:30,  2.80it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.87it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:33,  2.47it/s][A
- 55%|█████████████▋           | 98/179 [00:36<00:30,  2.68it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.79it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.85it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.46it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.68it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:27,  2.78it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:26,  2.86it/s][A
- 59%|██████████████          | 105/179 [00:38<00:30,  2.46it/s][A
- 59%|██████████████▏         | 106/179 [00:39<00:27,  2.69it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.78it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.85it/s][A
- 61%|██████████████▌         | 109/179 [00:40<00:28,  2.45it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.68it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:24,  2.78it/s][A
- 63%|███████████████         | 112/179 [00:41<00:23,  2.86it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.46it/s][A
- 64%|███████████████▎        | 114/179 [00:42<00:24,  2.69it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.80it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.87it/s][A
- 65%|███████████████▋        | 117/179 [00:43<00:25,  2.46it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.69it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.78it/s][A
- 67%|████████████████        | 120/179 [00:44<00:20,  2.87it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.47it/s][A
- 68%|████████████████▎       | 122/179 [00:45<00:21,  2.69it/s][A
- 69%|████████████████▍       | 123/179 [00:45<00:20,  2.78it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:19,  2.87it/s][A
- 70%|████████████████▊       | 125/179 [00:46<00:21,  2.48it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.69it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.79it/s][A
- 72%|█████████████████▏      | 128/179 [00:47<00:17,  2.86it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:20,  2.46it/s][A
- 73%|█████████████████▍      | 130/179 [00:48<00:18,  2.67it/s][A
- 73%|█████████████████▌      | 131/179 [00:48<00:17,  2.79it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.87it/s][A
- 74%|█████████████████▊      | 133/179 [00:49<00:18,  2.47it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.69it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.80it/s][A
- 76%|██████████████████▏     | 136/179 [00:50<00:15,  2.86it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:17,  2.46it/s][A
- 77%|██████████████████▌     | 138/179 [00:51<00:15,  2.69it/s][A
- 78%|██████████████████▋     | 139/179 [00:51<00:14,  2.80it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.88it/s][A
- 79%|██████████████████▉     | 141/179 [00:52<00:15,  2.46it/s][A
- 79%|███████████████████     | 142/179 [00:52<00:13,  2.69it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.78it/s][A
- 80%|███████████████████▎    | 144/179 [00:53<00:12,  2.85it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.44it/s][A
- 82%|███████████████████▌    | 146/179 [00:54<00:12,  2.68it/s][A
- 82%|███████████████████▋    | 147/179 [00:54<00:11,  2.79it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.87it/s][A
- 83%|███████████████████▉    | 149/179 [00:55<00:12,  2.47it/s][A
- 84%|████████████████████    | 150/179 [00:55<00:10,  2.70it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:09,  2.80it/s][A
- 85%|████████████████████▍   | 152/179 [00:56<00:09,  2.88it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.47it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.65it/s][A
- 87%|████████████████████▊   | 155/179 [00:57<00:08,  2.76it/s][A
- 87%|████████████████████▉   | 156/179 [00:57<00:08,  2.85it/s][A
- 88%|█████████████████████   | 157/179 [00:58<00:08,  2.46it/s][A
- 88%|█████████████████████▏  | 158/179 [00:58<00:07,  2.68it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.79it/s][A
- 89%|█████████████████████▍  | 160/179 [00:59<00:06,  2.87it/s][A
- 90%|█████████████████████▌  | 161/179 [00:59<00:07,  2.47it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.69it/s][A
- 91%|█████████████████████▊  | 163/179 [01:00<00:05,  2.79it/s][A
- 92%|█████████████████████▉  | 164/179 [01:00<00:05,  2.86it/s][A
- 92%|██████████████████████  | 165/179 [01:01<00:05,  2.47it/s][A
- 93%|██████████████████████▎ | 166/179 [01:01<00:04,  2.68it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.78it/s][A
- 94%|██████████████████████▌ | 168/179 [01:02<00:03,  2.87it/s][A
- 94%|██████████████████████▋ | 169/179 [01:02<00:04,  2.47it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.70it/s][A
- 96%|██████████████████████▉ | 171/179 [01:03<00:02,  2.80it/s][A
- 96%|███████████████████████ | 172/179 [01:03<00:02,  2.87it/s][A
- 97%|███████████████████████▏| 173/179 [01:04<00:02,  2.46it/s][A
- 97%|███████████████████████▎| 174/179 [01:04<00:01,  2.69it/s][A
- 98%|███████████████████████▍| 175/179 [01:04<00:01,  2.80it/s][A
- 98%|███████████████████████▌| 176/179 [01:05<00:01,  2.88it/s][A
- 99%|███████████████████████▋| 177/179 [01:05<00:00,  2.48it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.70it/s][A
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A                                                               
-                                                               [A{'eval_loss': 2.246039867401123, 'eval_runtime': 68.6355, 'eval_samples_per_second': 2.856, 'eval_steps_per_second': 1.428, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 0.95}
- 95%|███████████████████▉ | 950/1000 [6:03:46<06:25,  7.71s/it]
-100%|████████████████████████| 179/179 [01:06<00:00,  2.57it/s][A
-                                                               [A[2025-10-19 01:06:32,644] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-950
- 95%|███████████████████▉ | 951/1000 [6:03:56<25:30, 31.24s/it]                                                               {'loss': 2.287, 'grad_norm': 0.8099629878997803, 'learning_rate': 1.30832912661093e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.74, 'tokens_per_second_per_gpu': 997.07, 'epoch': 0.95}
- 95%|███████████████████▉ | 951/1000 [6:03:56<25:30, 31.24s/it] 95%|███████████████████▉ | 952/1000 [6:04:03<19:20, 24.18s/it]                                                               {'loss': 2.2643, 'grad_norm': 0.910591185092926, 'learning_rate': 1.2566280820298426e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 821.07, 'epoch': 0.95}
- 95%|███████████████████▉ | 952/1000 [6:04:03<19:20, 24.18s/it] 95%|████████████████████ | 953/1000 [6:04:11<15:03, 19.23s/it]                                                               {'loss': 2.3573, 'grad_norm': 0.8110288381576538, 'learning_rate': 1.2059628086956044e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1025.55, 'epoch': 0.95}
- 95%|████████████████████ | 953/1000 [6:04:11<15:03, 19.23s/it] 95%|████████████████████ | 954/1000 [6:04:19<12:05, 15.77s/it]                                                               {'loss': 2.5223, 'grad_norm': 0.8043859004974365, 'learning_rate': 1.1563338380629618e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.78, 'tokens_per_second_per_gpu': 1124.93, 'epoch': 0.95}
- 95%|████████████████████ | 954/1000 [6:04:19<12:05, 15.77s/it] 96%|████████████████████ | 955/1000 [6:04:27<10:00, 13.34s/it]                                                               {'loss': 2.1511, 'grad_norm': 0.7169449329376221, 'learning_rate': 1.1077416907163574e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1207.88, 'epoch': 0.95}
- 96%|████████████████████ | 955/1000 [6:04:27<10:00, 13.34s/it] 96%|████████████████████ | 956/1000 [6:04:34<08:32, 11.64s/it]                                                               {'loss': 2.3221, 'grad_norm': 0.9546728730201721, 'learning_rate': 1.0601868763643996e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 715.9, 'epoch': 0.96}
- 96%|████████████████████ | 956/1000 [6:04:34<08:32, 11.64s/it] 96%|████████████████████ | 957/1000 [6:04:42<07:29, 10.45s/it]                                                               {'loss': 2.4007, 'grad_norm': 0.9332824945449829, 'learning_rate': 1.0136698938346011e-06, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 909.16, 'epoch': 0.96}
- 96%|████████████████████ | 957/1000 [6:04:42<07:29, 10.45s/it] 96%|████████████████████ | 958/1000 [6:04:50<06:44,  9.62s/it]                                                               {'loss': 2.2667, 'grad_norm': 0.948166012763977, 'learning_rate': 9.68191231068083e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 739.18, 'epoch': 0.96}
- 96%|████████████████████ | 958/1000 [6:04:50<06:44,  9.62s/it] 96%|████████████████████▏| 959/1000 [6:04:57<06:11,  9.05s/it]                                                               {'loss': 2.1496, 'grad_norm': 0.7676699161529541, 'learning_rate': 9.237513651145225e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1144.45, 'epoch': 0.96}
- 96%|████████████████████▏| 959/1000 [6:04:57<06:11,  9.05s/it] 96%|████████████████████▏| 960/1000 [6:05:05<05:45,  8.64s/it]                                                               {'loss': 2.4495, 'grad_norm': 0.7553421854972839, 'learning_rate': 8.803507621270579e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1184.03, 'epoch': 0.96}
- 96%|████████████████████▏| 960/1000 [6:05:05<05:45,  8.64s/it] 96%|████████████████████▏| 961/1000 [6:05:13<05:26,  8.36s/it]                                                               {'loss': 2.4696, 'grad_norm': 0.8734245896339417, 'learning_rate': 8.379898773574924e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 985.05, 'epoch': 0.96}
- 96%|████████████████████▏| 961/1000 [6:05:13<05:26,  8.36s/it] 96%|████████████████████▏| 962/1000 [6:05:20<05:10,  8.16s/it]                                                               {'loss': 2.5366, 'grad_norm': 0.9553205370903015, 'learning_rate': 7.966691551514527e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 789.81, 'epoch': 0.96}
- 96%|████████████████████▏| 962/1000 [6:05:20<05:10,  8.16s/it] 96%|████████████████████▏| 963/1000 [6:05:28<04:56,  8.02s/it]                                                               {'loss': 2.3128, 'grad_norm': 1.0424203872680664, 'learning_rate': 7.563890289437825e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 642.35, 'epoch': 0.96}
- 96%|████████████████████▏| 963/1000 [6:05:28<04:56,  8.02s/it] 96%|████████████████████▏| 964/1000 [6:05:36<04:45,  7.92s/it]                                                               {'loss': 2.3017, 'grad_norm': 0.9237273931503296, 'learning_rate': 7.171499212539123e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 771.48, 'epoch': 0.96}
- 96%|████████████████████▏| 964/1000 [6:05:36<04:45,  7.92s/it] 96%|██���█████████████████▎| 965/1000 [6:05:43<04:35,  7.86s/it]                                                               {'loss': 2.3436, 'grad_norm': 0.821221649646759, 'learning_rate': 6.78952243681541e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1111.51, 'epoch': 0.96}
- 96%|████████████████████▎| 965/1000 [6:05:43<04:35,  7.86s/it] 97%|████████████████████▎| 966/1000 [6:05:51<04:25,  7.81s/it]                                                               {'loss': 2.0172, 'grad_norm': 0.8634496331214905, 'learning_rate': 6.41796396902239e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 789.7, 'epoch': 0.97}
- 97%|████████████████████▎| 966/1000 [6:05:51<04:25,  7.81s/it] 97%|████████████████████▎| 967/1000 [6:05:59<04:16,  7.77s/it]                                                               {'loss': 2.282, 'grad_norm': 0.8414502143859863, 'learning_rate': 6.056827706632185e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1027.47, 'epoch': 0.97}
- 97%|████████████████████▎| 967/1000 [6:05:59<04:16,  7.77s/it] 97%|████████████████████▎| 968/1000 [6:06:07<04:07,  7.75s/it]                                                               {'loss': 2.3397, 'grad_norm': 1.1495898962020874, 'learning_rate': 5.706117437793701e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 534.09, 'epoch': 0.97}
- 97%|████████████████████▎| 968/1000 [6:06:07<04:07,  7.75s/it] 97%|████████████████████▎| 969/1000 [6:06:14<03:59,  7.73s/it]                                                               {'loss': 2.4827, 'grad_norm': 0.8324930667877197, 'learning_rate': 5.365836841291438e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1116.92, 'epoch': 0.97}
- 97%|████████████████████▎| 969/1000 [6:06:14<03:59,  7.73s/it] 97%|████████████████████▎| 970/1000 [6:06:22<03:51,  7.72s/it]                                                               {'loss': 2.3351, 'grad_norm': 1.0874335765838623, 'learning_rate': 5.035989486508075e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 579.44, 'epoch': 0.97}
- 97%|████████████████████▎| 970/1000 [6:06:22<03:51,  7.72s/it] 97%|████████████████████▍| 971/1000 [6:06:30<03:43,  7.71s/it]                                                               {'loss': 2.2922, 'grad_norm': 0.791401743888855, 'learning_rate': 4.7165788333860536e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1109.94, 'epoch': 0.97}
- 97%|████████████████████▍| 971/1000 [6:06:30<03:43,  7.71s/it] 97%|████████████████████▍| 972/1000 [6:06:37<03:35,  7.71s/it]                                                               {'loss': 2.4829, 'grad_norm': 0.8731902241706848, 'learning_rate': 4.4076082323920576e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 877.97, 'epoch': 0.97}
- 97%|████████████████████▍| 972/1000 [6:06:37<03:35,  7.71s/it] 97%|████████████████████▍| 973/1000 [6:06:45<03:28,  7.71s/it]                                                               {'loss': 2.2317, 'grad_norm': 0.8560281991958618, 'learning_rate': 4.1090809244814785e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 932.27, 'epoch': 0.97}
- 97%|████████████████████▍| 973/1000 [6:06:45<03:28,  7.71s/it] 97%|████████████████████▍| 974/1000 [6:06:53<03:20,  7.70s/it]                                                               {'loss': 2.5177, 'grad_norm': 0.9274902939796448, 'learning_rate': 3.82100004106456e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 967.07, 'epoch': 0.97}
- 97%|████████████████████▍| 974/1000 [6:06:53<03:20,  7.70s/it] 98%|████████████████████▍| 975/1000 [6:07:00<03:12,  7.70s/it]                                                               {'loss': 2.3584, 'grad_norm': 0.9513389468193054, 'learning_rate': 3.543368603973529e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 776.96, 'epoch': 0.97}
- 98%|████████████████████▍| 975/1000 [6:07:00<03:12,  7.70s/it] 98%|████████████████████▍| 976/1000 [6:07:08<03:04,  7.70s/it]                                                               {'loss': 2.1989, 'grad_norm': 0.8030345439910889, 'learning_rate': 3.2761895254306287e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1064.08, 'epoch': 0.98}
- 98%|████████████████████▍| 976/1000 [6:07:08<03:04,  7.70s/it] 98%|████████████████████▌| 977/1000 [6:07:16<02:57,  7.70s/it]                                                               {'loss': 2.3525, 'grad_norm': 0.8223397135734558, 'learning_rate': 3.019465608018024e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1110.89, 'epoch': 0.98}
- 98%|████████████████████▌| 977/1000 [6:07:16<02:57,  7.70s/it] 98%|████████████████████▌| 978/1000 [6:07:24<02:49,  7.70s/it]                                                               {'loss': 2.336, 'grad_norm': 1.0492770671844482, 'learning_rate': 2.773199544648164e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 779.71, 'epoch': 0.98}
- 98%|████████████████████▌| 978/1000 [6:07:24<02:49,  7.70s/it] 98%|████████████████████▌| 979/1000 [6:07:31<02:41,  7.69s/it]                                                               {'loss': 2.357, 'grad_norm': 0.897686779499054, 'learning_rate': 2.537393918535358e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 980.21, 'epoch': 0.98}
- 98%|████████████████████▌| 979/1000 [6:07:31<02:41,  7.69s/it] 98%|████████████████████▌| 980/1000 [6:07:39<02:33,  7.70s/it]                                                               {'loss': 2.176, 'grad_norm': 0.8448941707611084, 'learning_rate': 2.312051203169352e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1001.24, 'epoch': 0.98}
- 98%|████████████████████▌| 980/1000 [6:07:39<02:33,  7.70s/it] 98%|████████████████████▌| 981/1000 [6:07:47<02:26,  7.69s/it]                                                               {'loss': 2.2181, 'grad_norm': 0.9005848169326782, 'learning_rate': 2.0971737622883515e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 900.62, 'epoch': 0.98}
- 98%|████████████████████▌| 981/1000 [6:07:47<02:26,  7.69s/it] 98%|████████████████████▌| 982/1000 [6:07:54<02:18,  7.69s/it]                                                               {'loss': 2.2886, 'grad_norm': 0.8972439169883728, 'learning_rate': 1.8927638498551502e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 875.01, 'epoch': 0.98}
- 98%|████████████████████▌| 982/1000 [6:07:54<02:18,  7.69s/it] 98%|████████████████████▋| 983/1000 [6:08:02<02:10,  7.69s/it]                                                               {'loss': 2.2567, 'grad_norm': 0.8892665505409241, 'learning_rate': 1.6988236100329292e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 844.42, 'epoch': 0.98}
- 98%|████████████████████▋| 983/1000 [6:08:02<02:10,  7.69s/it] 98%|████████████████████▋| 984/1000 [6:08:10<02:03,  7.70s/it]                                                               {'loss': 2.3351, 'grad_norm': 0.915696918964386, 'learning_rate': 1.5153550771630498e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 815.64, 'epoch': 0.98}
- 98%|████████████████████▋| 984/1000 [6:08:10<02:03,  7.70s/it] 98%|████████████████████▋| 985/1000 [6:08:17<01:55,  7.71s/it]                                                               {'loss': 2.2343, 'grad_norm': 0.9981441497802734, 'learning_rate': 1.3423601757436287e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 702.43, 'epoch': 0.98}
- 98%|████████████████████▋| 985/1000 [6:08:17<01:55,  7.71s/it] 99%|████████████████████▋| 986/1000 [6:08:25<01:47,  7.70s/it]                                                               {'loss': 2.192, 'grad_norm': 0.8215169906616211, 'learning_rate': 1.179840720409331e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1012.4, 'epoch': 0.99}
- 99%|████████████████████▋| 986/1000 [6:08:25<01:47,  7.70s/it] 99%|████████████████████▋| 987/1000 [6:08:33<01:40,  7.70s/it]                                                               {'loss': 2.3544, 'grad_norm': 1.0433471202850342, 'learning_rate': 1.0277984159122733e-07, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 829.67, 'epoch': 0.99}
- 99%|████████████████████▋| 987/1000 [6:08:33<01:40,  7.70s/it] 99%|████████████████████▋| 988/1000 [6:08:40<01:32,  7.70s/it]                                                               {'loss': 2.3737, 'grad_norm': 0.8312088847160339, 'learning_rate': 8.862348571043733e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1123.09, 'epoch': 0.99}
- 99%|████████████████████▋| 988/1000 [6:08:41<01:32,  7.70s/it] 99%|████████████████████▊| 989/1000 [6:08:48<01:24,  7.70s/it]                                                               {'loss': 2.0985, 'grad_norm': 1.0085126161575317, 'learning_rate': 7.551515289203615e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 642.57, 'epoch': 0.99}
- 99%|████████████████████▊| 989/1000 [6:08:48<01:24,  7.70s/it] 99%|████████████████████▊| 990/1000 [6:08:56<01:17,  7.70s/it]                                                               {'loss': 2.5521, 'grad_norm': 0.9324679970741272, 'learning_rate': 6.34549806362239e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 979.2, 'epoch': 0.99}
- 99%|████████████████████▊| 990/1000 [6:08:56<01:17,  7.70s/it] 99%|████████████████████▊| 991/1000 [6:09:04<01:09,  7.70s/it]                                                               {'loss': 2.2688, 'grad_norm': 0.8679972290992737, 'learning_rate': 5.2443095448506674e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 857.33, 'epoch': 0.99}
- 99%|████████████████████▊| 991/1000 [6:09:04<01:09,  7.70s/it] 99%|████████████████████▊| 992/1000 [6:09:11<01:01,  7.70s/it]                                                               {'loss': 2.2254, 'grad_norm': 0.8510658740997314, 'learning_rate': 4.247961283835311e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 952.28, 'epoch': 0.99}
- 99%|████████████████████▊| 992/1000 [6:09:11<01:01,  7.70s/it] 99%|████████████████████▊| 993/1000 [6:09:19<00:53,  7.69s/it]                                                               {'loss': 2.3777, 'grad_norm': 0.8851034641265869, 'learning_rate': 3.356463731798432e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 930.35, 'epoch': 0.99}
- 99%|████████████████████▊| 993/1000 [6:09:19<00:53,  7.69s/it] 99%|████████████████████▊| 994/1000 [6:09:27<00:46,  7.70s/it]                                                               {'loss': 2.501, 'grad_norm': 0.847767174243927, 'learning_rate': 2.5698262401263605e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 1058.39, 'epoch': 0.99}
- 99%|████████████████████▊| 994/1000 [6:09:27<00:46,  7.70s/it]100%|████████████████████▉| 995/1000 [6:09:34<00:38,  7.70s/it]                                                               {'loss': 2.0256, 'grad_norm': 0.8249082565307617, 'learning_rate': 1.888057060274173e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 914.1, 'epoch': 0.99}
-100%|████████████████████▉| 995/1000 [6:09:34<00:38,  7.70s/it]100%|████████████████████▉| 996/1000 [6:09:42<00:30,  7.70s/it]                                                               {'loss': 2.3413, 'grad_norm': 1.0485869646072388, 'learning_rate': 1.3111633436779791e-08, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 546.19, 'epoch': 1.0}
-100%|████████████████████▉| 996/1000 [6:09:42<00:30,  7.70s/it]100%|████████████████████▉| 997/1000 [6:09:50<00:23,  7.70s/it]                                                               {'loss': 2.4325, 'grad_norm': 0.923591136932373, 'learning_rate': 8.391511416816489e-09, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 844.28, 'epoch': 1.0}
-100%|████████████████████▉| 997/1000 [6:09:50<00:23,  7.70s/it]100%|████████████████████▉| 998/1000 [6:09:57<00:15,  7.70s/it]                                                               {'loss': 2.2315, 'grad_norm': 1.0989410877227783, 'learning_rate': 4.720254054679796e-09, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 518.76, 'epoch': 1.0}
-100%|████████████████████▉| 998/1000 [6:09:57<00:15,  7.70s/it]100%|████████████████████▉| 999/1000 [6:10:05<00:07,  7.69s/it]                                                               {'loss': 2.5925, 'grad_norm': 1.190458059310913, 'learning_rate': 2.0978998601206556e-09, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 618.02, 'epoch': 1.0}
-100%|████████████████████▉| 999/1000 [6:10:05<00:07,  7.69s/it]100%|████████████████████| 1000/1000 [6:10:13<00:00,  7.70s/it]                                                               {'loss': 2.2586, 'grad_norm': 0.9831822514533997, 'learning_rate': 5.244763404133046e-10, 'memory/max_active (GiB)': 17.43, 'memory/max_allocated (GiB)': 17.43, 'memory/device_reserved (GiB)': 17.79, 'tokens_per_second_per_gpu': 740.06, 'epoch': 1.0}
-100%|████████████████████| 1000/1000 [6:10:13<00:00,  7.70s/it][2025-10-19 01:13:00,005] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:42363] Running evaluation step...
-[2025-10-19 01:13:03,113] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.5175468921661377
-[2025-10-19 01:13:04,532] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.4183826446533203
-[2025-10-19 01:13:05,833] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3006186485290527
-[2025-10-19 01:13:07,214] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42363] generate_batches time: 1.3808095455169678
-[2025-10-19 01:13:07,215] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42363] gather_len_batches: [179]
-
-  0%|                                  | 0/179 [00:00<?, ?it/s][A
-  1%|▎                         | 2/179 [00:00<00:28,  6.19it/s][A
-  2%|▍                         | 3/179 [00:00<00:40,  4.37it/s][A
-  2%|▌                         | 4/179 [00:00<00:46,  3.77it/s][A
-  3%|▋                         | 5/179 [00:01<01:17,  2.23it/s][A
-  3%|▊                         | 6/179 [00:02<01:08,  2.53it/s][A
-  4%|█                         | 7/179 [00:02<01:04,  2.67it/s][A
-  4%|█▏                        | 8/179 [00:02<01:01,  2.80it/s][A
-  5%|█▎                        | 9/179 [00:03<01:09,  2.44it/s][A
-  6%|█▍                       | 10/179 [00:03<01:03,  2.67it/s][A
-  6%|█▌                       | 11/179 [00:03<01:00,  2.80it/s][A
-  7%|█▋                       | 12/179 [00:04<00:57,  2.89it/s][A
-  7%|█▊                       | 13/179 [00:04<01:06,  2.49it/s][A
-  8%|█▉                       | 14/179 [00:04<01:00,  2.72it/s][A
-  8%|██                       | 15/179 [00:05<00:58,  2.83it/s][A
-  9%|██▏                      | 16/179 [00:05<00:56,  2.90it/s][A
-  9%|██▎                      | 17/179 [00:06<01:04,  2.49it/s][A
- 10%|██▌                      | 18/179 [00:06<00:59,  2.73it/s][A
- 11%|██▋                      | 19/179 [00:06<00:56,  2.84it/s][A
- 11%|██▊                      | 20/179 [00:07<00:54,  2.92it/s][A
- 12%|██▉                      | 21/179 [00:07<01:02,  2.51it/s][A
- 12%|███                      | 22/179 [00:07<00:57,  2.73it/s][A
- 13%|███▏                     | 23/179 [00:08<00:54,  2.84it/s][A
- 13%|███▎                     | 24/179 [00:08<00:53,  2.92it/s][A
- 14%|███▍                     | 25/179 [00:09<01:01,  2.51it/s][A
- 15%|███▋                     | 26/179 [00:09<00:55,  2.73it/s][A
- 15%|███▊                     | 27/179 [00:09<00:53,  2.84it/s][A
- 16%|███▉                     | 28/179 [00:10<00:51,  2.92it/s][A
- 16%|████                     | 29/179 [00:10<00:59,  2.51it/s][A
- 17%|████▏                    | 30/179 [00:10<00:54,  2.74it/s][A
- 17%|████▎                    | 31/179 [00:11<00:51,  2.85it/s][A
- 18%|████▍                    | 32/179 [00:11<00:50,  2.93it/s][A
- 18%|████▌                    | 33/179 [00:11<00:57,  2.52it/s][A
- 19%|████▋                    | 34/179 [00:12<00:52,  2.74it/s][A
- 20%|████▉                    | 35/179 [00:12<00:50,  2.85it/s][A
- 20%|█████                    | 36/179 [00:12<00:48,  2.93it/s][A
- 21%|█████▏                   | 37/179 [00:13<00:56,  2.51it/s][A
- 21%|█████▎                   | 38/179 [00:13<00:51,  2.74it/s][A
- 22%|█████▍                   | 39/179 [00:14<00:49,  2.85it/s][A
- 22%|█████▌                   | 40/179 [00:14<00:47,  2.93it/s][A
- 23%|█████▋                   | 41/179 [00:14<00:54,  2.52it/s][A
- 23%|█████▊                   | 42/179 [00:15<00:49,  2.75it/s][A
- 24%|██████                   | 43/179 [00:15<00:47,  2.85it/s][A
- 25%|██████▏                  | 44/179 [00:15<00:46,  2.93it/s][A
- 25%|██████▎                  | 45/179 [00:16<00:53,  2.51it/s][A
- 26%|██████▍                  | 46/179 [00:16<00:48,  2.74it/s][A
- 26%|██████▌                  | 47/179 [00:16<00:46,  2.85it/s][A
- 27%|██████▋                  | 48/179 [00:17<00:44,  2.93it/s][A
- 27%|██████▊                  | 49/179 [00:17<00:51,  2.52it/s][A
- 28%|██████▉                  | 50/179 [00:18<00:47,  2.74it/s][A
- 28%|███████                  | 51/179 [00:18<00:45,  2.84it/s][A
- 29%|███████▎                 | 52/179 [00:18<00:43,  2.92it/s][A
- 30%|███████▍                 | 53/179 [00:19<00:50,  2.51it/s][A
- 30%|███████▌                 | 54/179 [00:19<00:45,  2.73it/s][A
- 31%|███████▋                 | 55/179 [00:19<00:43,  2.84it/s][A
- 31%|███████▊                 | 56/179 [00:20<00:42,  2.92it/s][A
- 32%|███████▉                 | 57/179 [00:20<00:48,  2.51it/s][A
- 32%|████████                 | 58/179 [00:21<00:44,  2.74it/s][A
- 33%|████████▏                | 59/179 [00:21<00:42,  2.85it/s][A
- 34%|████████▍                | 60/179 [00:21<00:40,  2.92it/s][A
- 34%|████████▌                | 61/179 [00:22<00:46,  2.52it/s][A
- 35%|████████▋                | 62/179 [00:22<00:42,  2.73it/s][A
- 35%|████████▊                | 63/179 [00:22<00:40,  2.84it/s][A
- 36%|████████▉                | 64/179 [00:23<00:39,  2.92it/s][A
- 36%|█████████                | 65/179 [00:23<00:45,  2.51it/s][A
- 37%|█████████▏               | 66/179 [00:23<00:41,  2.74it/s][A
- 37%|█████████▎               | 67/179 [00:24<00:39,  2.84it/s][A
- 38%|█████████▍               | 68/179 [00:24<00:37,  2.92it/s][A
- 39%|█████████▋               | 69/179 [00:25<00:43,  2.51it/s][A
- 39%|█████████▊               | 70/179 [00:25<00:39,  2.73it/s][A
- 40%|█████████▉               | 71/179 [00:25<00:37,  2.84it/s][A
- 40%|██████████               | 72/179 [00:26<00:36,  2.92it/s][A
- 41%|██████████▏              | 73/179 [00:26<00:42,  2.51it/s][A
- 41%|██████████▎              | 74/179 [00:26<00:38,  2.74it/s][A
- 42%|██████████▍              | 75/179 [00:27<00:36,  2.85it/s][A
- 42%|██████████▌              | 76/179 [00:27<00:35,  2.93it/s][A
- 43%|██████████▊              | 77/179 [00:28<00:44,  2.28it/s][A
- 44%|██████████▉              | 78/179 [00:28<00:39,  2.53it/s][A
- 44%|███████████              | 79/179 [00:28<00:37,  2.68it/s][A
- 45%|███████████▏             | 80/179 [00:29<00:35,  2.80it/s][A
- 45%|███████████▎             | 81/179 [00:29<00:39,  2.45it/s][A
- 46%|███████████▍             | 82/179 [00:29<00:36,  2.69it/s][A
- 46%|███████████▌             | 83/179 [00:30<00:34,  2.80it/s][A
- 47%|███████████▋             | 84/179 [00:30<00:32,  2.89it/s][A
- 47%|███████████▊             | 85/179 [00:31<00:43,  2.18it/s][A
- 48%|████████████             | 86/179 [00:31<00:37,  2.45it/s][A
- 49%|████████████▏            | 87/179 [00:31<00:35,  2.63it/s][A
- 49%|████████████▎            | 88/179 [00:32<00:32,  2.76it/s][A
- 50%|████████████▍            | 89/179 [00:32<00:37,  2.43it/s][A
- 50%|████████████▌            | 90/179 [00:32<00:33,  2.67it/s][A
- 51%|████████████▋            | 91/179 [00:33<00:31,  2.79it/s][A
- 51%|████████████▊            | 92/179 [00:33<00:30,  2.88it/s][A
- 52%|████████████▉            | 93/179 [00:34<00:34,  2.50it/s][A
- 53%|█████████████▏           | 94/179 [00:34<00:31,  2.73it/s][A
- 53%|█████████████▎           | 95/179 [00:34<00:29,  2.84it/s][A
- 54%|█████████████▍           | 96/179 [00:35<00:28,  2.91it/s][A
- 54%|█████████████▌           | 97/179 [00:35<00:32,  2.51it/s][A
- 55%|█████████████▋           | 98/179 [00:35<00:29,  2.73it/s][A
- 55%|█████████████▊           | 99/179 [00:36<00:28,  2.83it/s][A
- 56%|█████████████▍          | 100/179 [00:36<00:27,  2.91it/s][A
- 56%|█████████████▌          | 101/179 [00:37<00:31,  2.51it/s][A
- 57%|█████████████▋          | 102/179 [00:37<00:28,  2.73it/s][A
- 58%|█████████████▊          | 103/179 [00:37<00:26,  2.84it/s][A
- 58%|█████████████▉          | 104/179 [00:38<00:25,  2.91it/s][A
- 59%|██████████████          | 105/179 [00:38<00:29,  2.50it/s][A
- 59%|██████████████▏         | 106/179 [00:38<00:26,  2.73it/s][A
- 60%|██████████████▎         | 107/179 [00:39<00:25,  2.83it/s][A
- 60%|██████████████▍         | 108/179 [00:39<00:24,  2.92it/s][A
- 61%|██████████████▌         | 109/179 [00:39<00:27,  2.52it/s][A
- 61%|██████████████▋         | 110/179 [00:40<00:25,  2.74it/s][A
- 62%|██████████████▉         | 111/179 [00:40<00:23,  2.85it/s][A
- 63%|███████████████         | 112/179 [00:40<00:22,  2.92it/s][A
- 63%|███████████████▏        | 113/179 [00:41<00:26,  2.51it/s][A
- 64%|███████████████▎        | 114/179 [00:41<00:23,  2.73it/s][A
- 64%|███████████████▍        | 115/179 [00:42<00:22,  2.84it/s][A
- 65%|███████████████▌        | 116/179 [00:42<00:21,  2.92it/s][A
- 65%|███████████████▋        | 117/179 [00:42<00:24,  2.51it/s][A
- 66%|███████████████▊        | 118/179 [00:43<00:22,  2.73it/s][A
- 66%|███████████████▉        | 119/179 [00:43<00:21,  2.84it/s][A
- 67%|████████████████        | 120/179 [00:43<00:20,  2.92it/s][A
- 68%|████████████████▏       | 121/179 [00:44<00:23,  2.52it/s][A
- 68%|████████████████▎       | 122/179 [00:44<00:20,  2.74it/s][A
- 69%|████████████████▍       | 123/179 [00:44<00:19,  2.85it/s][A
- 69%|████████████████▋       | 124/179 [00:45<00:18,  2.92it/s][A
- 70%|████████████████▊       | 125/179 [00:45<00:21,  2.52it/s][A
- 70%|████████████████▉       | 126/179 [00:46<00:19,  2.73it/s][A
- 71%|█████████████████       | 127/179 [00:46<00:18,  2.84it/s][A
- 72%|█████████████████▏      | 128/179 [00:46<00:17,  2.91it/s][A
- 72%|█████████████████▎      | 129/179 [00:47<00:19,  2.51it/s][A
- 73%|█████████████████▍      | 130/179 [00:47<00:17,  2.73it/s][A
- 73%|█████████████████▌      | 131/179 [00:47<00:16,  2.84it/s][A
- 74%|█████████████████▋      | 132/179 [00:48<00:16,  2.91it/s][A
- 74%|█████████████████▊      | 133/179 [00:48<00:18,  2.50it/s][A
- 75%|█████████████████▉      | 134/179 [00:49<00:16,  2.72it/s][A
- 75%|██████████████████      | 135/179 [00:49<00:15,  2.83it/s][A
- 76%|██████████████████▏     | 136/179 [00:49<00:14,  2.91it/s][A
- 77%|██████████████████▎     | 137/179 [00:50<00:16,  2.51it/s][A
- 77%|██████████████████▌     | 138/179 [00:50<00:14,  2.74it/s][A
- 78%|██████████████████▋     | 139/179 [00:50<00:14,  2.83it/s][A
- 78%|██████████████████▊     | 140/179 [00:51<00:13,  2.91it/s][A
- 79%|██████████████████▉     | 141/179 [00:51<00:15,  2.51it/s][A
- 79%|███████████████████     | 142/179 [00:51<00:13,  2.73it/s][A
- 80%|███████████████████▏    | 143/179 [00:52<00:12,  2.83it/s][A
- 80%|███████████████████▎    | 144/179 [00:52<00:12,  2.91it/s][A
- 81%|███████████████████▍    | 145/179 [00:53<00:13,  2.50it/s][A
- 82%|███████████████████▌    | 146/179 [00:53<00:12,  2.73it/s][A
- 82%|███████████████████▋    | 147/179 [00:53<00:11,  2.84it/s][A
- 83%|███████████████████▊    | 148/179 [00:54<00:10,  2.92it/s][A
- 83%|███████████████████▉    | 149/179 [00:54<00:11,  2.51it/s][A
- 84%|████████████████████    | 150/179 [00:54<00:10,  2.74it/s][A
- 84%|████████████████████▏   | 151/179 [00:55<00:09,  2.84it/s][A
- 85%|████████████████████▍   | 152/179 [00:55<00:09,  2.92it/s][A
- 85%|████████████████████▌   | 153/179 [00:56<00:10,  2.52it/s][A
- 86%|████████████████████▋   | 154/179 [00:56<00:09,  2.74it/s][A
- 87%|████████████████████▊   | 155/179 [00:56<00:08,  2.85it/s][A
- 87%|████████████████████▉   | 156/179 [00:56<00:07,  2.92it/s][A
- 88%|█████████████████████   | 157/179 [00:57<00:08,  2.51it/s][A
- 88%|█████████████████████▏  | 158/179 [00:57<00:07,  2.74it/s][A
- 89%|█████████████████████▎  | 159/179 [00:58<00:07,  2.85it/s][A
- 89%|█████████████████████▍  | 160/179 [00:58<00:06,  2.92it/s][A
- 90%|█████████████████████▌  | 161/179 [00:58<00:07,  2.51it/s][A
- 91%|█████████████████████▋  | 162/179 [00:59<00:06,  2.74it/s][A
- 91%|█████████████████████▊  | 163/179 [00:59<00:05,  2.85it/s][A
- 92%|█████████████████████▉  | 164/179 [00:59<00:05,  2.93it/s][A
- 92%|██████████████████████  | 165/179 [01:00<00:05,  2.51it/s][A
- 93%|██████████████████████▎ | 166/179 [01:00<00:04,  2.73it/s][A
- 93%|██████████████████████▍ | 167/179 [01:01<00:04,  2.84it/s][A
- 94%|██████████████████████▌ | 168/179 [01:01<00:03,  2.92it/s][A
- 94%|██████████████████████▋ | 169/179 [01:01<00:03,  2.52it/s][A
- 95%|██████████████████████▊ | 170/179 [01:02<00:03,  2.75it/s][A
- 96%|██████████████████████▉ | 171/179 [01:02<00:02,  2.85it/s][A
- 96%|███████████████████████ | 172/179 [01:02<00:02,  2.92it/s][A
- 97%|███████████████████████▏| 173/179 [01:03<00:02,  2.52it/s][A
- 97%|███████████████████████▎| 174/179 [01:03<00:01,  2.74it/s][A
- 98%|███████████████████████▍| 175/179 [01:03<00:01,  2.85it/s][A
- 98%|███████████████████████▌| 176/179 [01:04<00:01,  2.92it/s][A
- 99%|███████████████████████▋| 177/179 [01:04<00:00,  2.52it/s][A
- 99%|███████████████████████▊| 178/179 [01:05<00:00,  2.74it/s][A
-100%|████████████████████████| 179/179 [01:05<00:00,  2.61it/s][A                                                               
-                                                               [A{'eval_loss': 2.245497226715088, 'eval_runtime': 67.8857, 'eval_samples_per_second': 2.887, 'eval_steps_per_second': 1.444, 'memory/max_active (GiB)': 7.78, 'memory/max_allocated (GiB)': 7.78, 'memory/device_reserved (GiB)': 17.79, 'epoch': 1.0}
-100%|████████████████████| 1000/1000 [6:11:28<00:00,  7.70s/it]
-100%|████████████████████████| 179/179 [01:05<00:00,  2.61it/s][A
-                                                               [A[2025-10-19 01:14:15,108] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-1000
-                                                               {'train_runtime': 22293.2666, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.045, 'train_loss': 2.4834489517211913, 'memory/max_active (GiB)': 2.61, 'memory/max_allocated (GiB)': 2.61, 'memory/device_reserved (GiB)': 8.84, 'epoch': 1.0}
-100%|████████████████████| 1000/1000 [6:11:30<00:00,  7.70s/it]100%|████████████████████| 1000/1000 [6:11:30<00:00, 22.29s/it]
-[2025-10-19 01:14:39,753] [INFO] [axolotl.train.save_trained_model:218] [PID:42363] Training completed! Saving trained model to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora.
-[2025-10-19 01:14:40,608] [INFO] [axolotl.train.save_trained_model:336] [PID:42363] Model successfully saved to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora
-[2025-10-19 01:14:41,005] [INFO] [axolotl.core.trainers.base._save:664] [PID:42363] Saving model checkpoint to ./outputs/sft/gemma-2-2b-it-rp-sft-qlora
-Processing Files (0 / 0)      : | |  0.00B /  0.00B            
-New Data Upload               : | |  0.00B /  0.00B            [A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  18%|▏| 58.7MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  18%|▏| 58.7MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  26%|▎| 97.3MB /  371MB,   ???B
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  35%|▎|  117MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  42%|▍|  156MB /  371MB,  293MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  50%|▌|  168MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  56%|▌|  206MB /  371MB,  272MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  66%|▋|  218MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  69%|▋|  257MB /  371MB,  266MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  81%|▊|  268MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  83%|▊|  307MB /  371MB,  262MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors:  96%|▉|  319MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (3 / 4)      :  96%|▉|  357MB /  371MB,  260MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (4 / 4)      : 100%|█|  371MB /  371MB,  228MB
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[A
-
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        [A[A
-
-
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        [A[A[A
-
-
-
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        [A[A[A[A
-
-
-
-
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        [A[A[A[A[AProcessing Files (4 / 4)      : 100%|█|  371MB /  371MB,  137MB
-New Data Upload               : | |  0.00B /  0.00B,  0.00B/s  
-  ...t-qlora/training_args.bin: 100%|█| 7.38kB / 7.38kB        
-  ...adapter_model.safetensors: 100%|█|  332MB /  332MB        
-  ...sft-qlora/tokenizer.model: 100%|█| 4.24MB / 4.24MB        
-  ...-sft-qlora/tokenizer.json: 100%|█| 34.4MB / 34.4MB        
\ No newline at end of file
+[2026-03-30 14:34:28,113] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:37135] min_input_len: 181
+[2026-03-30 14:34:28,114] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:37135] max_input_len: 103499
+Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204Dropping Invalid Sequences (<None or >204
+[2026-03-30 14:34:28,940] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:37135] Dropped 800 sequences outside valid range ([None, 2048])
+Drop Samples with Zero Trainable Tokens (Drop Samples with Zero Trainable Tokens (Drop Samples with Zero Trainable Tokens (Drop Samples with Zero Trainable Tokens (
+Saving the dataset (0/1 shards):   0%| | Saving the dataset (0/1 shards): 100%|█| Saving the dataset (1/1 shards): 100%|█| Saving the dataset (1/1 shards): 100%|█| 
+[2026-03-30 14:34:30,705] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:37135] total_num_tokens: 24_920_885
+[2026-03-30 14:34:45,899] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:37135] `total_supervised_tokens: 246_408_026`
+[2026-03-30 14:34:45,906] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:37135] total_num_steps: 2051
+[2026-03-30 14:34:45,907] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:37135] Maximum number of steps set at 1000
+[2026-03-30 14:34:46,033] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:37135] loading tokenizer... google/gemma-2-2b-it
+[2026-03-30 14:34:48,013] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:37135] EOS: 1 / <eos>
+[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:37135] BOS: 2 / <bos>
+[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:37135] PAD: 0 / <pad>
+[2026-03-30 14:34:48,014] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:37135] UNK: 3 / <unk>
+[2026-03-30 14:34:48,015] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:37135] Loading model
+[2026-03-30 14:34:48,126] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:37135] Patched Trainer.evaluation_loop with nanmean loss calculation
+[2026-03-30 14:34:48,127] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:37135] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
+[2026-03-30 14:34:48,180] [INFO] [axolotl.monkeypatch.attention.flash_attn_4.patch_flash_attn_4:52] [PID:37135] Flash Attention 4 is available for your GPU and offers faster training speeds. To enable: pip install flash-attn-4
+[2026-03-30 14:34:48,180] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:360] [PID:37135] Cannot patch self-attention - requires no dropout
+Loading weights:   0%| | 0/288 [00:00<?, Loading weights:   0%| | 1/288 [00:04<19:Loading weights:   1%| | 3/288 [00:04<05:Loading weights:   1%| | 4/288 [00:04<03:Loading weights:   2%| | 5/288 [00:04<02:Loading weights:   4%| | 11/288 [00:04<00Loading weights:   5%| | 14/288 [00:04<00Loading weights:   6%| | 17/288 [00:05<00Loading weights:   8%| | 24/288 [00:05<00Loading weights:  10%| | 28/288 [00:05<00Loading weights:  12%| | 34/288 [00:05<00Loading weights:  13%|▏| 38/288 [00:06<00Loading weights:  16%|▏| 47/288 [00:06<00Loading weights:  18%|▏| 51/288 [00:06<00Loading weights:  19%|▏| 56/288 [00:06<00Loading weights:  21%|▏| 60/288 [00:07<00Loading weights:  24%|▏| 69/288 [00:07<00Loading weights:  25%|▎| 72/288 [00:07<00Loading weights:  28%|▎| 80/288 [00:08<00Loading weights:  29%|▎| 83/288 [00:08<00Loading weights:  31%|▎| 89/288 [00:08<00Loading weights:  32%|▎| 93/288 [00:09<00Loading weights:  34%|▎| 99/288 [00:09<00Loading weights:  36%|▎| 103/288 [00:09<0Loading weights:  37%|▎| 106/288 [00:09<0Loading weights:  39%|▍| 113/288 [00:09<0Loading weights:  40%|▍| 116/288 [00:10<0Loading weights:  43%|▍| 124/288 [00:10<0Loading weights:  44%|▍| 127/288 [00:10<0Loading weights:  47%|▍| 135/288 [00:11<0Loading weights:  48%|▍| 138/288 [00:11<0Loading weights:  50%|▌| 144/288 [00:11<0Loading weights:  51%|▌| 148/288 [00:11<0Loading weights:  53%|▌| 154/288 [00:12<0Loading weights:  55%|▌| 158/288 [00:12<0Loading weights:  56%|▌| 161/288 [00:12<0Loading weights:  58%|▌| 168/288 [00:12<0Loading weights:  59%|▌| 171/288 [00:13<0Loading weights:  61%|▌| 177/288 [00:13<0Loading weights:  63%|▋| 181/288 [00:13<0Loading weights:  66%|▋| 190/288 [00:13<0Loading weights:  67%|▋| 194/288 [00:14<0Loading weights:  70%|▋| 201/288 [00:14<0Loading weights:  71%|▋| 204/288 [00:14<0Loading weights:  74%|▋| 212/288 [00:14<0Loading weights:  75%|▋| 215/288 [00:15<0Loading weights:  77%|▊| 223/288 [00:15<0Loading weights:  78%|▊| 226/288 [00:15<0Loading weights:  81%|▊| 232/288 [00:15<0Loading weights:  82%|▊| 235/288 [00:16<0Loading weights:  83%|▊| 238/288 [00:16<0Loading weights:  84%|▊| 243/288 [00:16<0Loading weights:  86%|▊| 247/288 [00:16<0Loading weights:  88%|▉| 254/288 [00:16<0Loading weights:  90%|▉| 258/288 [00:17<0Loading weights:  93%|▉| 267/288 [00:17<0Loading weights:  94%|▉| 270/288 [00:17<0Loading weights:  95%|▉| 275/288 [00:18<0Loading weights:  97%|▉| 278/288 [00:18<0Loading weights:  98%|▉| 281/288 [00:18<0Loading weights: 100%|█| 288/288 [00:18<0
+[2026-03-30 14:35:07,907] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:892] [PID:37135] converting PEFT model w/ prepare_model_for_kbit_training
+[2026-03-30 14:35:07,909] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:37135] Converting modules to torch.bfloat16
+[2026-03-30 14:35:07,912] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:37135] Memory usage after model load 4.273GB (+4.273GB allocated, +4.320GB reserved)
+trainable params: 25,559,040 || all params: 2,639,900,928 || trainable%: 0.9682
+[2026-03-30 14:35:08,057] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:37135] after adapters 2.170GB (+2.170GB allocated, +4.424GB reserved)
+[2026-03-30 14:35:09,370] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:376] [PID:37135] LoRA kernels: dropout=0.05 enabled
+[2026-03-30 14:35:09,370] [WARNING] [axolotl.monkeypatch.lora_kernels.warning_once:46] [PID:37135] Cannot patch some MLP layers - requires LoRA adapters
+[2026-03-30 14:35:11,020] [INFO] [axolotl.train.save_initial_configs:417] [PID:37135] Pre-saving adapter config to /workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora...
+[2026-03-30 14:35:11,026] [INFO] [axolotl.train.save_initial_configs:421] [PID:37135] Pre-saving tokenizer to /workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora...
+[2026-03-30 14:35:11,396] [INFO] [axolotl.train.save_initial_configs:426] [PID:37135] Pre-saving model config to /workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora...
+[2026-03-30 14:35:11,411] [INFO] [axolotl.train.execute_training:222] [PID:37135] Starting trainer...
+[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
+[34m[1mwandb[0m: Currently logged in as: [33mfactoryaiart[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
+[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
+[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
+[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run Attention-Bloc...
+[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.25.1
+[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/axolotl/wandb/run-20260330_143512-Attention-Block-Only_Test[0m
+[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
+[34m[1mwandb[0m: Syncing run [33mAttention-Block-Only_Test[0m
+[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/factoryaiart/rp-sft[0m
+[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/factoryaiart/rp-sft/runs/Attention-Block-Only_Test[0m
+[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
+[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
+[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
+[2026-03-30 14:35:13,965] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:37135] The Axolotl config has been saved to the WandB run under files.
+  0%|           | 0/1000 [00:00<?, ?it/s][2026-03-30 14:35:13,969] [INFO] [axolotl.core.trainers.base.evaluate:401] [PID:37135] Running evaluation step...
+
+  0%|            | 0/100 [00:00<?, ?it/s][A
+  3%|    | 3/100 [00:00<00:03, 24.40it/s][A
+  6%|▏   | 6/100 [00:00<00:05, 17.69it/s][A
+  8%|▎   | 8/100 [00:00<00:05, 16.44it/s][A
+ 10%|▎  | 10/100 [00:00<00:05, 15.66it/s][A
+ 12%|▎  | 12/100 [00:00<00:05, 15.83it/s][A
+ 14%|▍  | 14/100 [00:00<00:05, 15.93it/s][A
+ 16%|▍  | 16/100 [00:00<00:05, 15.92it/s][A
+ 18%|▌  | 18/100 [00:01<00:05, 15.89it/s][A
+ 20%|▌  | 20/100 [00:01<00:04, 16.23it/s][A
+ 22%|▋  | 22/100 [00:01<00:05, 15.39it/s][A
+ 24%|▋  | 24/100 [00:01<00:04, 16.11it/s][A
+ 26%|▊  | 26/100 [00:01<00:04, 15.69it/s][A
+ 28%|▊  | 28/100 [00:01<00:04, 16.11it/s][A
+ 30%|▉  | 30/100 [00:01<00:04, 15.39it/s][A
+ 32%|▉  | 32/100 [00:02<00:04, 15.39it/s][A
+ 34%|█  | 34/100 [00:02<00:04, 15.87it/s][A
+ 36%|█  | 36/100 [00:02<00:03, 16.55it/s][A
+ 38%|█▏ | 38/100 [00:02<00:03, 16.33it/s][A
+ 40%|█▏ | 40/100 [00:02<00:03, 16.25it/s][A
+ 42%|█▎ | 42/100 [00:02<00:03, 16.14it/s][A
+ 44%|█▎ | 44/100 [00:02<00:03, 16.61it/s][A
+ 46%|█▍ | 46/100 [00:02<00:03, 15.74it/s][A
+ 48%|█▍ | 48/100 [00:02<00:03, 16.30it/s][A
+ 50%|█▌ | 50/100 [00:03<00:03, 15.97it/s][A
+ 52%|█▌ | 52/100 [00:03<00:02, 16.26it/s][A
+ 54%|█▌ | 54/100 [00:03<00:02, 15.73it/s][A
+ 56%|█▋ | 56/100 [00:03<00:02, 16.22it/s][A
+ 58%|█▋ | 58/100 [00:03<00:02, 16.12it/s][A
+ 60%|█▊ | 60/100 [00:03<00:02, 16.62it/s][A
+ 62%|█▊ | 62/100 [00:03<00:02, 16.41it/s][A
+ 64%|█▉ | 64/100 [00:03<00:02, 16.54it/s][A
+ 66%|█▉ | 66/100 [00:04<00:02, 16.06it/s][A
+ 68%|██ | 68/100 [00:04<00:01, 16.81it/s][A
+ 70%|██ | 70/100 [00:04<00:01, 16.32it/s][A
+ 72%|██▏| 72/100 [00:04<00:01, 16.96it/s][A
+ 74%|██▏| 74/100 [00:04<00:01, 15.40it/s][A
+ 76%|██▎| 76/100 [00:04<00:01, 16.50it/s][A
+ 78%|██▎| 78/100 [00:04<00:01, 16.24it/s][A
+ 80%|██▍| 80/100 [00:04<00:01, 16.70it/s][A
+ 82%|██▍| 82/100 [00:05<00:01, 16.35it/s][A
+ 85%|██▌| 85/100 [00:05<00:00, 17.05it/s][A
+ 87%|██▌| 87/100 [00:05<00:00, 17.58it/s][A
+ 89%|██▋| 89/100 [00:05<00:00, 17.39it/s][A
+ 91%|██▋| 91/100 [00:05<00:00, 17.32it/s][A
+ 93%|██▊| 93/100 [00:05<00:00, 16.38it/s][A
+ 95%|██▊| 95/100 [00:05<00:00, 16.03it/s][A
+ 97%|██▉| 97/100 [00:05<00:00, 16.11it/s][A
+100%|██| 100/100 [00:06<00:00, 16.39it/s][A                                         
+                                         [A{'eval_loss': '3.072', 'eval_runtime': '6.631', 'eval_samples_per_second': '30.16', 'eval_steps_per_second': '15.08', 'eval_ppl': '21.58', 'memory/max_active (GiB)': '11.71', 'memory/max_allocated (GiB)': '11.71', 'memory/device_reserved (GiB)': '26.44', 'epoch': 0}
+  0%|           | 0/1000 [00:06<?, ?it/s]
+100%|██| 100/100 [00:06<00:00, 16.39it/s][A
+                                         [A  0%| | 1/1000 [00:08<2:17:09,  8.24s/it]                                         {'loss': '2.989', 'grad_norm': '1.052', 'learning_rate': '0', 'ppl': '19.87', 'memory/max_active (GiB)': '16.03', 'memory/max_allocated (GiB)': '16.03', 'memory/device_reserved (GiB)': '27.1', 'tokens/train_per_sec_per_gpu': '1377', 'tokens/total': 14720, 'tokens/trainable': 4243, 'epoch': '0.0004877'}
+  0%| | 1/1000 [00:08<2:17:09,  8.24s/it]  0%| | 2/1000 [00:09<1:04:19,  3.87s/it]                                         {'loss': '3.182', 'grad_norm': '0.6385', 'learning_rate': '6.667e-06', 'ppl': '24.09', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '27.14', 'tokens/train_per_sec_per_gpu': '1534', 'tokens/total': 28544, 'tokens/trainable': 10481, 'epoch': '0.0009755'}
+  0%| | 2/1000 [00:09<1:04:19,  3.87s/it]  0%|   | 3/1000 [00:09<40:28,  2.44s/it]                                         {'loss': '3.084', 'grad_norm': '0.9915', 'learning_rate': '1.333e-05', 'ppl': '21.85', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '27.14', 'tokens/train_per_sec_per_gpu': '507.5', 'tokens/total': 40064, 'tokens/trainable': 14004, 'epoch': '0.001463'}
+  0%|   | 3/1000 [00:09<40:28,  2.44s/it]  0%|   | 4/1000 [00:10<29:14,  1.76s/it]                                         {'loss': '3.034', 'grad_norm': '0.6002', 'learning_rate': '2e-05', 'ppl': '20.78', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '27.14', 'tokens/train_per_sec_per_gpu': '3704', 'tokens/total': 53632, 'tokens/trainable': 20483, 'epoch': '0.001951'}
+  0%|   | 4/1000 [00:10<29:14,  1.76s/it]  0%|   | 5/1000 [00:11<23:07,  1.39s/it]                                         {'loss': '2.94', 'grad_norm': '0.6711', 'learning_rate': '2.667e-05', 'ppl': '18.92', 'memory/max_active (GiB)': '15.09', 'memory/max_allocated (GiB)': '15.09', 'memory/device_reserved (GiB)': '27.14', 'tokens/train_per_sec_per_gpu': '552.2', 'tokens/total': 67200, 'tokens/trainable': 24362, 'epoch': '0.002439'}
+  0%|   | 5/1000 [00:11<23:07,  1.39s/it]  1%|   | 6/1000 [00:11<19:19,  1.17s/it]                                         {'loss': '2.934', 'grad_norm': '0.4941', 'learning_rate': '3.333e-05', 'ppl': '18.8', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '513.5', 'tokens/total': 78208, 'tokens/trainable': 27273, 'epoch': '0.002926'}
+  1%|   | 6/1000 [00:11<19:19,  1.17s/it]  1%|   | 7/1000 [00:12<17:00,  1.03s/it]                                         {'loss': '3.076', 'grad_norm': '0.395', 'learning_rate': '4e-05', 'ppl': '21.67', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1343', 'tokens/total': 92288, 'tokens/trainable': 31669, 'epoch': '0.003414'}
+  1%|   | 7/1000 [00:12<17:00,  1.03s/it]  1%|   | 8/1000 [00:13<15:33,  1.06it/s]                                         {'loss': '2.749', 'grad_norm': '0.2582', 'learning_rate': '4.667e-05', 'ppl': '15.63', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2650', 'tokens/total': 106496, 'tokens/trainable': 38815, 'epoch': '0.003902'}
+  1%|   | 8/1000 [00:13<15:33,  1.06it/s]  1%|   | 9/1000 [00:14<14:37,  1.13it/s]                                         {'loss': '2.805', 'grad_norm': '0.2812', 'learning_rate': '5.333e-05', 'ppl': '16.53', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2848', 'tokens/total': 120960, 'tokens/trainable': 45214, 'epoch': '0.00439'}
+  1%|   | 9/1000 [00:14<14:37,  1.13it/s]  1%|  | 10/1000 [00:15<14:09,  1.17it/s]                                         {'loss': '2.829', 'grad_norm': '0.2755', 'learning_rate': '6e-05', 'ppl': '16.94', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '4322', 'tokens/total': 136192, 'tokens/trainable': 53558, 'epoch': '0.004877'}
+  1%|  | 10/1000 [00:15<14:09,  1.17it/s]  1%|  | 11/1000 [00:15<13:43,  1.20it/s]                                         {'loss': '2.873', 'grad_norm': '0.3161', 'learning_rate': '6.667e-05', 'ppl': '17.69', 'memory/max_active (GiB)': '16.42', 'memory/max_allocated (GiB)': '16.42', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2026', 'tokens/total': 151040, 'tokens/trainable': 60241, 'epoch': '0.005365'}
+  1%|  | 11/1000 [00:15<13:43,  1.20it/s]  1%|  | 12/1000 [00:16<13:25,  1.23it/s]                                         {'loss': '2.764', 'grad_norm': '0.3051', 'learning_rate': '7.333e-05', 'ppl': '15.87', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1570', 'tokens/total': 166016, 'tokens/trainable': 64759, 'epoch': '0.005853'}
+  1%|  | 12/1000 [00:16<13:25,  1.23it/s]  1%|  | 13/1000 [00:17<12:59,  1.27it/s]                                         {'loss': '2.832', 'grad_norm': '0.2587', 'learning_rate': '8e-05', 'ppl': '16.98', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2915', 'tokens/total': 179584, 'tokens/trainable': 70338, 'epoch': '0.006341'}
+  1%|  | 13/1000 [00:17<12:59,  1.27it/s]  1%|  | 14/1000 [00:18<12:57,  1.27it/s]                                         {'loss': '2.69', 'grad_norm': '0.2952', 'learning_rate': '8.667e-05', 'ppl': '14.73', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '335.9', 'tokens/total': 195072, 'tokens/trainable': 73958, 'epoch': '0.006828'}
+  1%|  | 14/1000 [00:18<12:57,  1.27it/s]  2%|  | 15/1000 [00:18<12:36,  1.30it/s]                                         {'loss': '3.028', 'grad_norm': '0.275', 'learning_rate': '9.333e-05', 'ppl': '20.65', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1891', 'tokens/total': 208512, 'tokens/trainable': 78154, 'epoch': '0.007316'}
+  2%|  | 15/1000 [00:18<12:36,  1.30it/s]  2%|  | 16/1000 [00:19<12:35,  1.30it/s]                                         {'loss': '2.946', 'grad_norm': '0.241', 'learning_rate': '0.0001', 'ppl': '19.03', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2310', 'tokens/total': 223232, 'tokens/trainable': 85571, 'epoch': '0.007804'}
+  2%|  | 16/1000 [00:19<12:35,  1.30it/s]  2%|  | 17/1000 [00:20<12:27,  1.32it/s]                                         {'loss': '2.851', 'grad_norm': '0.2425', 'learning_rate': '0.0001067', 'ppl': '17.31', 'memory/max_active (GiB)': '15.53', 'memory/max_allocated (GiB)': '15.53', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2000', 'tokens/total': 236928, 'tokens/trainable': 90407, 'epoch': '0.008292'}
+  2%|  | 17/1000 [00:20<12:27,  1.32it/s]  2%|  | 18/1000 [00:21<12:27,  1.31it/s]                                         {'loss': '2.833', 'grad_norm': '0.2086', 'learning_rate': '0.0001133', 'ppl': '17', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2103', 'tokens/total': 251520, 'tokens/trainable': 95783, 'epoch': '0.008779'}
+  2%|  | 18/1000 [00:21<12:27,  1.31it/s]  2%|  | 19/1000 [00:21<12:18,  1.33it/s]                                         {'loss': '2.608', 'grad_norm': '0.1785', 'learning_rate': '0.00012', 'ppl': '13.57', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2689', 'tokens/total': 265216, 'tokens/trainable': 102772, 'epoch': '0.009267'}
+  2%|  | 19/1000 [00:21<12:18,  1.33it/s]  2%|  | 20/1000 [00:22<12:22,  1.32it/s]                                         {'loss': '2.611', 'grad_norm': '0.1852', 'learning_rate': '0.0001267', 'ppl': '13.62', 'memory/max_active (GiB)': '16.42', 'memory/max_allocated (GiB)': '16.42', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2319', 'tokens/total': 279552, 'tokens/trainable': 108546, 'epoch': '0.009755'}
+  2%|  | 20/1000 [00:22<12:22,  1.32it/s]  2%|  | 21/1000 [00:23<12:13,  1.34it/s]                                         {'loss': '2.783', 'grad_norm': '0.174', 'learning_rate': '0.0001333', 'ppl': '16.17', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '3427', 'tokens/total': 293120, 'tokens/trainable': 114759, 'epoch': '0.01024'}
+  2%|  | 21/1000 [00:23<12:13,  1.34it/s]  2%|  | 22/1000 [00:24<12:18,  1.32it/s]                                         {'loss': '2.561', 'grad_norm': '0.1548', 'learning_rate': '0.00014', 'ppl': '12.94', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2871', 'tokens/total': 307840, 'tokens/trainable': 123248, 'epoch': '0.01073'}
+  2%|  | 22/1000 [00:24<12:18,  1.32it/s]  2%|  | 23/1000 [00:24<12:12,  1.33it/s]                                         {'loss': '2.685', 'grad_norm': '0.2033', 'learning_rate': '0.0001467', 'ppl': '14.66', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1622', 'tokens/total': 321408, 'tokens/trainable': 127914, 'epoch': '0.01122'}
+  2%|  | 23/1000 [00:24<12:12,  1.33it/s]  2%|  | 24/1000 [00:25<12:03,  1.35it/s]                                         {'loss': '2.635', 'grad_norm': '0.1711', 'learning_rate': '0.0001533', 'ppl': '13.94', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1389', 'tokens/total': 334464, 'tokens/trainable': 132556, 'epoch': '0.01171'}
+  2%|  | 24/1000 [00:25<12:03,  1.35it/s]  2%|  | 25/1000 [00:26<12:07,  1.34it/s]                                         {'loss': '2.714', 'grad_norm': '0.1771', 'learning_rate': '0.00016', 'ppl': '15.1', 'memory/max_active (GiB)': '15.98', 'memory/max_allocated (GiB)': '15.98', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2693', 'tokens/total': 348928, 'tokens/trainable': 138076, 'epoch': '0.01219'}
+  2%|  | 25/1000 [00:26<12:07,  1.34it/s]  3%|  | 26/1000 [00:27<12:03,  1.35it/s]                                         {'loss': '2.664', 'grad_norm': '0.2041', 'learning_rate': '0.0001667', 'ppl': '14.36', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1817', 'tokens/total': 362496, 'tokens/trainable': 142370, 'epoch': '0.01268'}
+  3%|  | 26/1000 [00:27<12:03,  1.35it/s]  3%|  | 27/1000 [00:27<12:01,  1.35it/s]                                         {'loss': '2.661', 'grad_norm': '0.2129', 'learning_rate': '0.0001733', 'ppl': '14.32', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1870', 'tokens/total': 376320, 'tokens/trainable': 148590, 'epoch': '0.01317'}
+  3%|  | 27/1000 [00:27<12:01,  1.35it/s]  3%|  | 28/1000 [00:28<12:11,  1.33it/s]                                         {'loss': '2.501', 'grad_norm': '0.1584', 'learning_rate': '0.00018', 'ppl': '12.19', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2527', 'tokens/total': 391552, 'tokens/trainable': 156847, 'epoch': '0.01366'}
+  3%|  | 28/1000 [00:28<12:11,  1.33it/s]  3%|  | 29/1000 [00:29<11:54,  1.36it/s]                                         {'loss': '2.739', 'grad_norm': '0.2235', 'learning_rate': '0.0001867', 'ppl': '15.47', 'memory/max_active (GiB)': '14.2', 'memory/max_allocated (GiB)': '14.2', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '984.4', 'tokens/total': 403968, 'tokens/trainable': 159811, 'epoch': '0.01414'}
+  3%|  | 29/1000 [00:29<11:54,  1.36it/s]  3%|  | 30/1000 [00:30<11:55,  1.36it/s]                                         {'loss': '2.745', 'grad_norm': '0.1478', 'learning_rate': '0.0001933', 'ppl': '15.56', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2664', 'tokens/total': 417792, 'tokens/trainable': 166680, 'epoch': '0.01463'}
+  3%|  | 30/1000 [00:30<11:55,  1.36it/s]  3%|  | 31/1000 [00:30<11:49,  1.37it/s]                                         {'loss': '2.648', 'grad_norm': '0.1657', 'learning_rate': '0.0002', 'ppl': '14.13', 'memory/max_active (GiB)': '14.74', 'memory/max_allocated (GiB)': '14.74', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2487', 'tokens/total': 430848, 'tokens/trainable': 173012, 'epoch': '0.01512'}
+  3%|  | 31/1000 [00:30<11:49,  1.37it/s]  3%|  | 32/1000 [00:31<12:00,  1.34it/s]                                         {'loss': '2.49', 'grad_norm': '0.1343', 'learning_rate': '0.0002', 'ppl': '12.07', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '3640', 'tokens/total': 445184, 'tokens/trainable': 180698, 'epoch': '0.01561'}
+  3%|  | 32/1000 [00:31<12:00,  1.34it/s]  3%|  | 33/1000 [00:32<11:57,  1.35it/s]                                         {'loss': '2.712', 'grad_norm': '0.1401', 'learning_rate': '0.0002', 'ppl': '15.06', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2099', 'tokens/total': 458880, 'tokens/trainable': 188020, 'epoch': '0.0161'}
+  3%|  | 33/1000 [00:32<11:57,  1.35it/s]  3%|  | 34/1000 [00:32<12:01,  1.34it/s]                                         {'loss': '2.592', 'grad_norm': '0.15', 'learning_rate': '0.0002', 'ppl': '13.36', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2166', 'tokens/total': 473216, 'tokens/trainable': 195907, 'epoch': '0.01658'}
+  3%|  | 34/1000 [00:32<12:01,  1.34it/s]  4%|  | 35/1000 [00:33<11:56,  1.35it/s]                                         {'loss': '2.783', 'grad_norm': '0.152', 'learning_rate': '0.0002', 'ppl': '16.17', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1919', 'tokens/total': 486784, 'tokens/trainable': 202826, 'epoch': '0.01707'}
+  4%|  | 35/1000 [00:33<11:56,  1.35it/s]  4%|  | 36/1000 [00:34<12:05,  1.33it/s]                                         {'loss': '2.715', 'grad_norm': '0.1309', 'learning_rate': '0.0002', 'ppl': '15.11', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '3714', 'tokens/total': 501632, 'tokens/trainable': 211084, 'epoch': '0.01756'}
+  4%|  | 36/1000 [00:34<12:05,  1.33it/s]  4%|  | 37/1000 [00:35<12:13,  1.31it/s]                                         {'loss': '2.59', 'grad_norm': '0.1387', 'learning_rate': '0.0002', 'ppl': '13.33', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1159', 'tokens/total': 516864, 'tokens/trainable': 219294, 'epoch': '0.01805'}
+  4%|  | 37/1000 [00:35<12:13,  1.31it/s]  4%|  | 38/1000 [00:36<12:04,  1.33it/s]                                         {'loss': '2.852', 'grad_norm': '0.1592', 'learning_rate': '0.0002', 'ppl': '17.32', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1172', 'tokens/total': 530560, 'tokens/trainable': 225756, 'epoch': '0.01853'}
+  4%|  | 38/1000 [00:36<12:04,  1.33it/s]  4%|  | 39/1000 [00:36<12:06,  1.32it/s]                                         {'loss': '2.544', 'grad_norm': '0.1455', 'learning_rate': '0.0002', 'ppl': '12.73', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2376', 'tokens/total': 544512, 'tokens/trainable': 231931, 'epoch': '0.01902'}
+  4%|  | 39/1000 [00:36<12:06,  1.32it/s]  4%|  | 40/1000 [00:37<12:04,  1.32it/s]                                         {'loss': '2.637', 'grad_norm': '0.1575', 'learning_rate': '0.0002', 'ppl': '13.97', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2545', 'tokens/total': 558720, 'tokens/trainable': 239133, 'epoch': '0.01951'}
+  4%|  | 40/1000 [00:37<12:04,  1.32it/s]  4%|  | 41/1000 [00:38<11:59,  1.33it/s]                                         {'loss': '2.739', 'grad_norm': '0.2001', 'learning_rate': '0.0001999', 'ppl': '15.47', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '316.7', 'tokens/total': 572544, 'tokens/trainable': 242626, 'epoch': '0.02'}
+  4%|  | 41/1000 [00:38<11:59,  1.33it/s]  4%|  | 42/1000 [00:39<12:01,  1.33it/s]                                         {'loss': '2.484', 'grad_norm': '0.1508', 'learning_rate': '0.0001999', 'ppl': '11.99', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2302', 'tokens/total': 587008, 'tokens/trainable': 248434, 'epoch': '0.02049'}
+  4%|  | 42/1000 [00:39<12:01,  1.33it/s]  4%|  | 43/1000 [00:39<12:14,  1.30it/s]                                         {'loss': '2.584', 'grad_norm': '0.1343', 'learning_rate': '0.0001999', 'ppl': '13.25', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1963', 'tokens/total': 602624, 'tokens/trainable': 256104, 'epoch': '0.02097'}
+  4%|  | 43/1000 [00:39<12:14,  1.30it/s]  4%|  | 44/1000 [00:40<12:10,  1.31it/s]                                         {'loss': '2.591', 'grad_norm': '0.1752', 'learning_rate': '0.0001999', 'ppl': '13.35', 'memory/max_active (GiB)': '15.18', 'memory/max_allocated (GiB)': '15.18', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2604', 'tokens/total': 616704, 'tokens/trainable': 260763, 'epoch': '0.02146'}
+  4%|  | 44/1000 [00:40<12:10,  1.31it/s]  4%|  | 45/1000 [00:41<12:14,  1.30it/s]                                         {'loss': '2.652', 'grad_norm': '0.1738', 'learning_rate': '0.0001999', 'ppl': '14.18', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '2554', 'tokens/total': 631936, 'tokens/trainable': 265673, 'epoch': '0.02195'}
+  4%|  | 45/1000 [00:41<12:14,  1.30it/s]  5%|  | 46/1000 [00:42<12:05,  1.31it/s]                                         {'loss': '2.516', 'grad_norm': '0.135', 'learning_rate': '0.0001999', 'ppl': '12.38', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1692', 'tokens/total': 645632, 'tokens/trainable': 272630, 'epoch': '0.02244'}
+  5%|  | 46/1000 [00:42<12:05,  1.31it/s]  5%|  | 47/1000 [00:42<12:11,  1.30it/s]                                         {'loss': '2.506', 'grad_norm': '0.1401', 'learning_rate': '0.0001999', 'ppl': '12.26', 'memory/max_active (GiB)': '16.42', 'memory/max_allocated (GiB)': '16.42', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1721', 'tokens/total': 660992, 'tokens/trainable': 279679, 'epoch': '0.02292'}
+  5%|  | 47/1000 [00:42<12:11,  1.30it/s]  5%|  | 48/1000 [00:43<11:54,  1.33it/s]                                         {'loss': '2.877', 'grad_norm': '0.172', 'learning_rate': '0.0001998', 'ppl': '17.75', 'memory/max_active (GiB)': '16.42', 'memory/max_allocated (GiB)': '16.42', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '824.1', 'tokens/total': 673792, 'tokens/trainable': 284979, 'epoch': '0.02341'}
+  5%|  | 48/1000 [00:43<11:54,  1.33it/s]  5%|  | 49/1000 [00:44<11:59,  1.32it/s]                                         {'loss': '2.646', 'grad_norm': '0.126', 'learning_rate': '0.0001998', 'ppl': '14.1', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '4063', 'tokens/total': 688384, 'tokens/trainable': 293596, 'epoch': '0.0239'}
+  5%|  | 49/1000 [00:44<11:59,  1.32it/s]  5%|  | 50/1000 [00:45<11:59,  1.32it/s]                                         {'loss': '2.639', 'grad_norm': '0.169', 'learning_rate': '0.0001998', 'ppl': '14', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '38.86', 'tokens/train_per_sec_per_gpu': '1653', 'tokens/total': 702976, 'tokens/trainable': 298477, 'epoch': '0.02439'}
+  5%|  | 50/1000 [00:45<11:59,  1.32it/s][2026-03-30 14:35:59,107] [INFO] [axolotl.core.trainers.base.evaluate:401] [PID:37135] Running evaluation step...
+
+  0%|            | 0/100 [00:00<?, ?it/s][A
+  3%|    | 3/100 [00:00<00:03, 26.18it/s][A
+  6%|▏   | 6/100 [00:00<00:05, 18.74it/s][A
+  8%|▎   | 8/100 [00:00<00:05, 17.95it/s][A
+ 10%|▎  | 10/100 [00:00<00:05, 16.78it/s][A
+ 12%|▎  | 12/100 [00:00<00:05, 17.58it/s][A
+ 14%|▍  | 14/100 [00:00<00:04, 17.29it/s][A
+ 16%|▍  | 16/100 [00:00<00:04, 17.27it/s][A
+ 18%|▌  | 18/100 [00:01<00:04, 17.40it/s][A
+ 20%|▌  | 20/100 [00:01<00:04, 17.65it/s][A
+ 22%|▋  | 22/100 [00:01<00:04, 17.15it/s][A
+ 24%|▋  | 24/100 [00:01<00:04, 17.75it/s][A
+ 26%|▊  | 26/100 [00:01<00:04, 17.10it/s][A
+ 28%|▊  | 28/100 [00:01<00:04, 17.13it/s][A
+ 30%|▉  | 30/100 [00:01<00:04, 16.73it/s][A
+ 32%|▉  | 32/100 [00:01<00:04, 16.81it/s][A
+ 34%|█  | 34/100 [00:01<00:03, 16.92it/s][A
+ 37%|█  | 37/100 [00:02<00:03, 17.30it/s][A
+ 39%|█▏ | 39/100 [00:02<00:03, 17.32it/s][A
+ 41%|█▏ | 41/100 [00:02<00:03, 17.53it/s][A
+ 44%|█▎ | 44/100 [00:02<00:03, 18.21it/s][A
+ 46%|█▍ | 46/100 [00:02<00:03, 17.31it/s][A
+ 48%|█▍ | 48/100 [00:02<00:02, 17.71it/s][A
+ 50%|█▌ | 50/100 [00:02<00:02, 17.06it/s][A
+ 52%|█▌ | 52/100 [00:02<00:02, 17.05it/s][A
+ 54%|█▌ | 54/100 [00:03<00:02, 16.40it/s][A
+ 56%|█▋ | 56/100 [00:03<00:02, 16.72it/s][A
+ 58%|█▋ | 58/100 [00:03<00:02, 16.60it/s][A
+ 60%|█▊ | 60/100 [00:03<00:02, 17.06it/s][A
+ 62%|█▊ | 62/100 [00:03<00:02, 17.49it/s][A
+ 64%|█▉ | 64/100 [00:03<00:02, 17.69it/s][A
+ 66%|█▉ | 66/100 [00:03<00:02, 16.93it/s][A
+ 68%|██ | 68/100 [00:03<00:01, 17.48it/s][A
+ 70%|██ | 70/100 [00:04<00:01, 16.88it/s][A
+ 72%|██▏| 72/100 [00:04<00:01, 17.39it/s][A
+ 74%|██▏| 74/100 [00:04<00:01, 16.48it/s][A
+ 77%|██▎| 77/100 [00:04<00:01, 17.11it/s][A
+ 79%|██▎| 79/100 [00:04<00:01, 17.53it/s][A
+ 81%|██▍| 81/100 [00:04<00:01, 17.26it/s][A
+ 84%|██▌| 84/100 [00:04<00:00, 18.46it/s][A
+ 86%|██▌| 86/100 [00:04<00:00, 17.77it/s][A
+ 89%|██▋| 89/100 [00:05<00:00, 17.98it/s][A
+ 91%|██▋| 91/100 [00:05<00:00, 18.32it/s][A
+ 93%|██▊| 93/100 [00:05<00:00, 17.22it/s][A
+ 95%|██▊| 95/100 [00:05<00:00, 16.84it/s][A
+ 97%|██▉| 97/100 [00:05<00:00, 16.92it/s][A
+100%|██| 100/100 [00:05<00:00, 16.45it/s][A                                         
+                                         [A{'eval_loss': '2.694', 'eval_runtime': '6.185', 'eval_samples_per_second': '32.34', 'eval_steps_per_second': '16.17', 'eval_ppl': '14.8', 'memory/max_active (GiB)': '11.76', 'memory/max_allocated (GiB)': '11.76', 'memory/device_reserved (GiB)': '38.86', 'epoch': '0.02439', 'tokens/train_per_sec_per_gpu': '0'}
+  5%|  | 50/1000 [00:51<11:59,  1.32it/s]
+100%|██| 100/100 [00:05<00:00, 16.45it/s][A
+                                         [A[2026-03-30 14:36:05,317] [INFO] [axolotl.core.trainers.base._save:722] [PID:37135] Saving model checkpoint to /workspace/data/axolotl-outputs/sft/gemma-2-2b-it-rp-sft-qlora/checkpoint-50
+  5%|  | 51/1000 [00:54<52:12,  3.30s/it]                                         {'loss': '2.365', 'grad_norm': '0.1699', 'learning_rate': '0.0001998', 'ppl': '10.65', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '35.34', 'tokens/train_per_sec_per_gpu': '2401', 'tokens/total': 717056, 'tokens/trainable': 302602, 'epoch': '0.02488'}
+  5%|  | 51/1000 [00:54<52:12,  3.30s/it]  5%|  | 52/1000 [00:55<39:42,  2.51s/it]                                         {'loss': '3.102', 'grad_norm': '0.2156', 'learning_rate': '0.0001998', 'ppl': '22.25', 'memory/max_active (GiB)': '15.09', 'memory/max_allocated (GiB)': '15.09', 'memory/device_reserved (GiB)': '35.34', 'tokens/train_per_sec_per_gpu': '1588', 'tokens/total': 728704, 'tokens/trainable': 307033, 'epoch': '0.02536'}
+  5%|  | 52/1000 [00:55<39:42,  2.51s/it]  5%|  | 53/1000 [00:55<31:21,  1.99s/it]                                         {'loss': '2.689', 'grad_norm': '0.1493', 'learning_rate': '0.0001997', 'ppl': '14.72', 'memory/max_active (GiB)': '16.51', 'memory/max_allocated (GiB)': '16.51', 'memory/device_reserved (GiB)': '58.42', 'tokens/train_per_sec_per_gpu': '3790', 'tokens/total': 743040, 'tokens/trainable': 313364, 'epoch': '0.02585'}
+  5%|  | 53/1000 [00:55<31:21,  1.99s/it]  5%|  | 54/1000 [00:56<25:31,  1.62s/it]                                         {'loss': '2.762', 'grad_norm': '0.1468', 'learning_rate': '0.0001997', 'ppl': '15.83', 'memory/max_active (GiB)': '16.07', 'memory/max_allocated (GiB)': '16.07', 'memory/device_reserved (GiB)': '58.42', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 757504, 'tokens/trainable': 320183, 'epoch': '0.02634'}
+  5%|  | 54/1000 [00:56<25:31,  1.62s/it]  6%|  | 55/1000 [00:57<21:06,  1.34s/it]                                         {'loss': '2.652', 'grad_norm': '0.1634', 'learning_rate': '0.0001997', 'ppl': '14.18', 'memory/max_active (GiB)': '15.63', 'memory/max_allocated (GiB)': '15.63', 'memory/device_reserved (GiB)': '58.42', 'tokens/train_per_sec_per_gpu': '1216', 'tokens/total': 769792, 'tokens/trainable': 324651, 'epoch': '0.02683'}
+  6%|  | 55/1000 [00:57<21:06,  1.34s/it]
\ No newline at end of file