diff --git "a/debug.log" "b/debug.log"
new file mode 100644--- /dev/null
+++ "b/debug.log"
@@ -0,0 +1,763 @@
+[2025-12-28 11:04:35,744] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:42410] baseline 0.000GB ()
+[2025-12-28 11:04:35,746] [INFO] [axolotl.cli.config.load_cfg:256] [PID:42410] config:
+{
+  "activation_offloading": false,
+  "adapter": "lora",
+  "axolotl_config_path": "tuner.yaml",
+  "base_model": "codellama/CodeLlama-7b-hf",
+  "base_model_config": "codellama/CodeLlama-7b-hf",
+  "batch_size": 8,
+  "bf16": true,
+  "capabilities": {
+    "bf16": true,
+    "compute_capability": "sm_90",
+    "fp8": false,
+    "n_gpu": 1,
+    "n_node": 1
+  },
+  "chat_template": "llama3",
+  "context_parallel_size": 1,
+  "dataloader_num_workers": 1,
+  "dataloader_pin_memory": true,
+  "dataloader_prefetch_factor": 256,
+  "dataset_num_proc": 384,
+  "datasets": [
+    {
+      "chat_template": "tokenizer_default",
+      "conversation": "llama3",
+      "field_messages": "messages",
+      "message_property_mappings": {
+        "content": "content",
+        "role": "role"
+      },
+      "path": "darwinkernelpanic/luau-reasoning-normalized",
+      "trust_remote_code": false,
+      "type": "chat_template"
+    }
+  ],
+  "ddp": false,
+  "deepspeed": {
+    "bf16": {
+      "enabled": true
+    },
+    "fp16": {
+      "enabled": false
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "steps_per_print": 2000,
+    "train_micro_batch_size_per_gpu": "auto",
+    "zero_optimization": {
+      "contiguous_gradients": true,
+      "gather_16bit_weights_on_model_save": true,
+      "offload_optimizer": {
+        "device": "cpu",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "cpu",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "reduce_bucket_size": "auto",
+      "stage": 3,
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_prefetch_bucket_size": "auto"
+    }
+  },
+  "device": "cuda:0",
+  "dion_rank_fraction": 1.0,
+  "dion_rank_multiple_of": 1,
+  "env_capabilities": {
+    "torch_version": "2.8.0"
+  },
+  "eval_batch_size": 4,
+  "eval_causal_lm_metrics": [
+    "sacrebleu",
+    "comet",
+    "ter",
+    "chrf"
+  ],
+  "eval_max_new_tokens": 128,
+  "eval_sample_packing": true,
+  "eval_steps": 100,
+  "eval_table_size": 0,
+  "experimental_skip_move_to_device": true,
+  "fp16": false,
+  "gradient_accumulation_steps": 2,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": {
+    "use_reentrant": true
+  },
+  "group_by_length": true,
+  "hub_model_id": "darwinkernelpanic/luau-codellama-7b-reasoning",
+  "hub_strategy": "every_save",
+  "include_tkps": true,
+  "is_falcon_derived_model": false,
+  "is_llama_derived_model": true,
+  "is_mistral_derived_model": false,
+  "learning_rate": 0.0002,
+  "lisa_layers_attribute": "model.layers",
+  "load_best_model_at_end": false,
+  "load_in_4bit": false,
+  "load_in_8bit": false,
+  "local_rank": 0,
+  "logging_steps": 1,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_r": 16,
+  "lora_target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "loraplus_lr_embedding": 1e-06,
+  "lr_scheduler": "cosine",
+  "mean_resizing_embeddings": false,
+  "micro_batch_size": 4,
+  "model_config_type": "llama",
+  "num_epochs": 3.0,
+  "optimizer": "adamw_torch",
+  "otel_metrics_host": "localhost",
+  "otel_metrics_port": 8000,
+  "output_dir": "./outputs/luau-codellama-h200",
+  "pad_to_sequence_len": true,
+  "pretrain_multipack_attn": true,
+  "profiler_steps_start": 0,
+  "qlora_sharded_model_loading": false,
+  "ray_num_workers": 1,
+  "resources_per_worker": {
+    "GPU": 1
+  },
+  "sample_packing": true,
+  "sample_packing_bin_size": 200,
+  "sample_packing_group_size": 100000,
+  "save_only_model": false,
+  "save_safetensors": true,
+  "save_steps": 200,
+  "save_strategy": "steps",
+  "save_total_limit": 3,
+  "seed": 42,
+  "sequence_len": 4096,
+  "shuffle_before_merging_datasets": false,
+  "shuffle_merged_datasets": true,
+  "skip_prepare_dataset": false,
+  "streaming_multipack_buffer_size": 10000,
+  "strict": false,
+  "tensor_parallel_size": 1,
+  "tf32": true,
+  "tiled_mlp_use_original_mlp": true,
+  "tokenizer_config": "codellama/CodeLlama-7b-hf",
+  "tokenizer_save_jinja_files": true,
+  "tokenizer_type": "LlamaTokenizer",
+  "torch_dtype": "torch.bfloat16",
+  "train_on_inputs": false,
+  "trl": {
+    "log_completions": false,
+    "mask_truncated_completions": false,
+    "ref_model_mixup_alpha": 0.9,
+    "ref_model_sync_steps": 64,
+    "scale_rewards": true,
+    "sync_ref_model": false,
+    "use_vllm": false,
+    "vllm_server_host": "0.0.0.0",
+    "vllm_server_port": 8000
+  },
+  "type_of_model": "LlamaForCausalLM",
+  "use_otel_metrics": false,
+  "use_ray": false,
+  "val_set_size": 0.05,
+  "vllm": {
+    "device": "auto",
+    "dtype": "auto",
+    "gpu_memory_utilization": 0.9,
+    "host": "0.0.0.0",
+    "port": 8000
+  },
+  "warmup_steps": 10,
+  "weight_decay": 0.0,
+  "world_size": 1
+}
+[2025-12-28 11:04:36,377] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42410] EOS: 2 / </s>
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42410] BOS: 1 / <s>
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:42410] PAD: 2 / </s>
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:42410] UNK: 0 / <unk>
+[2025-12-28 11:04:36,378] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:42410] Unable to find prepared dataset in last_run_prepared/b7c17715ff7f64badeb455c51ab5d648
+[2025-12-28 11:04:36,378] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:42410] Loading raw datasets...
+[2025-12-28 11:04:36,378] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:42410] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+[2025-12-28 11:04:38,127] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42410] Loading dataset: darwinkernelpanic/luau-reasoning-normalized with base_type: chat_template and prompt_style: None
+[2025-12-28 11:04:38,130] [INFO] [axolotl.prompt_strategies.chat_template.__call__:996] [PID:42410] Using chat template:
+---
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
+
+' }}{% endif %}
+
+---
+[2025-12-28 11:04:38,137] [WARNING] [axolotl.prompt_strategies.chat_template._validate_eot_and_eos_tokens:337] [PID:42410] EOS token '</s>' not found in chat_template. Please check if your template/EOS token is correct.
+[2025-12-28 11:04:38,508] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:42410] min_input_len: 636
+[2025-12-28 11:04:38,508] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:42410] max_input_len: 12839
+[2025-12-28 11:04:41,234] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:42410] Dropped 755 samples from dataset
+Saving the dataset (0/56 shards):   0%|                                                                                   | 0/14586 [00:00<?, ? examples/s]Saving the dataset (0/56 shards):   2%|█▎                                                                      | 261/14586 [00:01<00:55, 257.60 examples/s]Saving the dataset (1/56 shards):   2%|█▎                                                                      | 261/14586 [00:01<00:55, 257.60 examples/s]Saving the dataset (2/56 shards):   9%|██████▎                                                                | 1305/14586 [00:01<00:51, 257.60 examples/s]Saving the dataset (3/56 shards):   9%|██████▎                                                                | 1305/14586 [00:01<00:51, 257.60 examples/s]Saving the dataset (4/56 shards):   9%|██████▎                                                                | 1305/14586 [00:01<00:51, 257.60 examples/s]Saving the dataset (5/56 shards):   9%|██████▎                                                                | 1305/14586 [00:01<00:51, 257.60 examples/s]Saving the dataset (6/56 shards):  11%|███████▌                                                               | 1566/14586 [00:01<00:50, 257.60 examples/s]Saving the dataset (7/56 shards):  14%|██████████▏                                                            | 2088/14586 [00:01<00:48, 257.60 examples/s]Saving the dataset (8/56 shards):  18%|████████████▋                                                          | 2610/14586 [00:01<00:46, 257.60 examples/s]Saving the dataset (9/56 shards):  18%|████████████▋                                                          | 2610/14586 [00:01<00:46, 257.60 examples/s]Saving the dataset (10/56 shards):  18%|████████████▌                                                         | 2610/14586 [00:01<00:46, 257.60 examples/s]Saving the dataset (11/56 shards):  21%|███████████████                                                       | 3132/14586 [00:01<00:44, 257.60 examples/s]Saving the dataset (12/56 shards):  21%|███████████████                                                       | 3132/14586 [00:01<00:44, 257.60 examples/s]Saving the dataset (13/56 shards):  27%|██████████████████▊                                                   | 3915/14586 [00:01<00:41, 257.60 examples/s]Saving the dataset (14/56 shards):  27%|██████████████████▊                                                   | 3915/14586 [00:01<00:41, 257.60 examples/s]Saving the dataset (15/56 shards):  29%|████████████████████                                                  | 4176/14586 [00:01<00:40, 257.60 examples/s]Saving the dataset (16/56 shards):  29%|████████████████████                                                  | 4176/14586 [00:01<00:40, 257.60 examples/s]Saving the dataset (17/56 shards):  30%|█████████████████████▎                                                | 4437/14586 [00:01<00:39, 257.60 examples/s]Saving the dataset (18/56 shards):  36%|█████████████████████████                                             | 5220/14586 [00:01<00:36, 257.60 examples/s]Saving the dataset (19/56 shards):  36%|█████████████████████████                                             | 5220/14586 [00:01<00:36, 257.60 examples/s]Saving the dataset (20/56 shards):  36%|█████████████████████████                                             | 5220/14586 [00:01<00:36, 257.60 examples/s]Saving the dataset (21/56 shards):  39%|███████████████████████████▌                                          | 5742/14586 [00:01<00:34, 257.60 examples/s]Saving the dataset (22/56 shards):  39%|███████████████████████████▌                                          | 5742/14586 [00:01<00:34, 257.60 examples/s]Saving the dataset (23/56 shards):  45%|███████████████████████████████▎                                      | 6525/14586 [00:01<00:31, 257.60 examples/s]Saving the dataset (24/56 shards):  45%|███████████████████████████████▎                                      | 6525/14586 [00:01<00:31, 257.60 examples/s]Saving the dataset (25/56 shards):  45%|███████████████████████████████▎                                      | 6525/14586 [00:01<00:31, 257.60 examples/s]Saving the dataset (26/56 shards):  50%|███████████████████████████████████                                   | 7306/14586 [00:01<00:28, 257.60 examples/s]Saving the dataset (27/56 shards):  50%|███████████████████████████████████                                   | 7306/14586 [00:01<00:28, 257.60 examples/s]Saving the dataset (28/56 shards):  50%|███████████████████████████████████                                   | 7306/14586 [00:01<00:28, 257.60 examples/s]Saving the dataset (29/56 shards):  52%|████████████████████████████████████▎                                 | 7566/14586 [00:01<00:27, 257.60 examples/s]Saving the dataset (30/56 shards):  55%|██████████████████████████████████████▊                               | 8086/14586 [00:01<00:25, 257.60 examples/s]Saving the dataset (31/56 shards):  57%|████████████████████████████████████████                              | 8346/14586 [00:01<00:24, 257.60 examples/s]Saving the dataset (32/56 shards):  57%|████████████████████████████████████████                              | 8346/14586 [00:01<00:24, 257.60 examples/s]Saving the dataset (33/56 shards):  64%|█████████████████████████████████████████████                         | 9386/14586 [00:01<00:20, 257.60 examples/s]Saving the dataset (34/56 shards):  64%|█████████████████████████████████████████████                         | 9386/14586 [00:01<00:20, 257.60 examples/s]Saving the dataset (35/56 shards):  64%|█████████████████████████████████████████████                         | 9386/14586 [00:01<00:20, 257.60 examples/s]Saving the dataset (36/56 shards):  66%|██████████████████████████████████████████████▎                       | 9646/14586 [00:01<00:19, 257.60 examples/s]Saving the dataset (37/56 shards):  66%|██████████████████████████████████████████████▎                       | 9646/14586 [00:01<00:19, 257.60 examples/s]Saving the dataset (38/56 shards):  68%|███████████████████████████████████████████████▌                      | 9906/14586 [00:01<00:18, 257.60 examples/s]Saving the dataset (39/56 shards):  71%|█████████████████████████████████████████████████▎                   | 10426/14586 [00:01<00:16, 257.60 examples/s]Saving the dataset (40/56 shards):  71%|█████████████████████████████████████████████████▎                   | 10426/14586 [00:01<00:16, 257.60 examples/s]Saving the dataset (41/56 shards):  73%|██████████████████████████████████████████████████▌                  | 10686/14586 [00:01<00:15, 257.60 examples/s]Saving the dataset (42/56 shards):  75%|███████████████████████████████████████████████████▊                 | 10946/14586 [00:01<00:14, 257.60 examples/s]Saving the dataset (43/56 shards):  77%|█████████████████████████████████████████████████████                | 11206/14586 [00:01<00:13, 257.60 examples/s]Saving the dataset (44/56 shards):  80%|███████████████████████████████████████████████████████▍             | 11726/14586 [00:01<00:11, 257.60 examples/s]Saving the dataset (45/56 shards):  80%|███████████████████████████████████████████████████████▍             | 11726/14586 [00:01<00:11, 257.60 examples/s]Saving the dataset (46/56 shards):  86%|███████████████████████████████████████████████████████████▏         | 12506/14586 [00:01<00:08, 257.60 examples/s]Saving the dataset (47/56 shards):  89%|█████████████████████████████████████████████████████████████▌       | 13026/14586 [00:01<00:06, 257.60 examples/s]Saving the dataset (48/56 shards):  91%|██████████████████████████████████████████████████████████████▊      | 13286/14586 [00:01<00:05, 257.60 examples/s]Saving the dataset (49/56 shards):  91%|██████████████████████████████████████████████████████████████▊      | 13286/14586 [00:01<00:05, 257.60 examples/s]Saving the dataset (50/56 shards):  91%|██████████████████████████████████████████████████████████████▊      | 13286/14586 [00:01<00:05, 257.60 examples/s]Saving the dataset (51/56 shards):  91%|██████████████████████████████████████████████████████████████▊      | 13286/14586 [00:01<00:05, 257.60 examples/s]Saving the dataset (52/56 shards):  95%|█████████████████████████████████████████████████████████████████▎   | 13806/14586 [00:01<00:03, 257.60 examples/s]Saving the dataset (53/56 shards):  95%|█████████████████████████████████████████████████████████████████▎   | 13806/14586 [00:01<00:03, 257.60 examples/s]Saving the dataset (54/56 shards):  96%|██████████████████████████████████████████████████████████████████▌  | 14066/14586 [00:01<00:02, 257.60 examples/s]Saving the dataset (55/56 shards): 100%|█████████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 257.60 examples/s]Saving the dataset (56/56 shards): 100%|█████████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 257.60 examples/s]Saving the dataset (56/56 shards): 100%|███████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 13067.01 examples/s]
+[2025-12-28 11:04:48,679] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:42410] total_num_tokens: 1_357_721
+[2025-12-28 11:04:48,684] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:42410] `total_supervised_tokens: 1_271_453`
+[2025-12-28 11:04:48,692] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,247] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,552] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3049201965332031
+[2025-12-28 11:04:49,552] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,839] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.2874150276184082
+[2025-12-28 11:04:49,840] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:50,133] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.2929878234863281
+[2025-12-28 11:04:50,133] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.27963781356811523
+[2025-12-28 11:04:50,413] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:42410] data_loader_len: 45
+[2025-12-28 11:04:50,413] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:42410] sample_packing_eff_est across ranks: [0.9131538664342287]
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:42410] sample_packing_eff_est: None
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:42410] total_num_steps: 135
+[2025-12-28 11:04:50,489] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:42410] total_num_tokens: 25_392_481
+[2025-12-28 11:04:50,608] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:42410] `total_supervised_tokens: 23_772_065`
+[2025-12-28 11:04:50,703] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,045] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,353] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.30963134765625
+[2025-12-28 11:04:51,355] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,664] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.31055235862731934
+[2025-12-28 11:04:51,666] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,976] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3107116222381592
+[2025-12-28 11:04:51,977] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3080286979675293
+[2025-12-28 11:04:52,284] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [1667]
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:42410] data_loader_len: 833
+[2025-12-28 11:04:52,284] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:42410] sample_packing_eff_est across ranks: [0.9284613122121649]
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:42410] sample_packing_eff_est: 0.93
+[2025-12-28 11:04:52,285] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:42410] total_num_steps: 2499
+[2025-12-28 11:04:52,287] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:42410] Maximum number of steps set at 2499
+[2025-12-28 11:04:52,291] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:42410] loading tokenizer... codellama/CodeLlama-7b-hf
+[2025-12-28 11:04:52,784] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42410] EOS: 2 / </s>
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42410] BOS: 1 / <s>
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:42410] PAD: 2 / </s>
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:42410] UNK: 0 / <unk>
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:42410] Loading model
+[2025-12-28 11:04:52,926] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:42410] Patched Trainer.evaluation_loop with nanmean loss calculation
+[2025-12-28 11:04:52,927] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:42410] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
+[2025-12-28 11:04:52,927] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:42410] Applying multipack dataloader patch for sample packing...
+[2025-12-28 11:04:52,927] [INFO] [axolotl.loaders.patch_manager._patch_llama_sample_packing:430] [PID:42410] Patching llama _prepare_4d_causal_attention_mask*...
+Loading checkpoint shards:   0%|                                                                                                     | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|██████████████████████████████████████████████▌                                              | 1/2 [00:01<00:01,  1.42s/it]Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.14it/s]Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.04it/s]
+generation_config.json:   0%|                                                                                                    | 0.00/116 [00:00<?, ?B/s]generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 1.91MB/s]
+[2025-12-28 11:05:00,338] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:42410] Converting modules to torch.bfloat16
+[2025-12-28 11:05:00,339] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42410] Memory usage after model load 0.500GB (+0.500GB allocated, +0.510GB reserved)
+trainable params: 16,777,216 || all params: 6,755,323,904 || trainable%: 0.2484
+[2025-12-28 11:05:00,457] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42410] after adapters 0.063GB (+0.063GB allocated, +0.572GB reserved)
+[2025-12-28 11:05:05,368] [INFO] [axolotl.train.save_initial_configs:413] [PID:42410] Pre-saving adapter config to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,368] [INFO] [axolotl.train.save_initial_configs:417] [PID:42410] Pre-saving tokenizer to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,369] [INFO] [axolotl.train.save_initial_configs:422] [PID:42410] Pre-saving model config to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,370] [INFO] [axolotl.train.execute_training:212] [PID:42410] Starting trainer...
+[2025-12-28 11:05:07,213] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6575620174407959
+[2025-12-28 11:05:07,824] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6103956699371338
+[2025-12-28 11:05:08,456] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6313827037811279
+[2025-12-28 11:05:09,110] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.653618574142456
+[2025-12-28 11:05:09,110] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [1666]
+Parameter Offload - Persistent parameters statistics: param_count = 65, numel = 266240
+  0%|                                                                                                                             | 0/2499 [00:00<?, ?it/s][2025-12-28 11:05:36,131] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:05:37,595] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7198138236999512
+[2025-12-28 11:05:38,326] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7297320365905762
+[2025-12-28 11:05:39,063] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7372677326202393
+[2025-12-28 11:05:39,815] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7512753009796143
+[2025-12-28 11:05:39,815] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+  0%|                                                                                                                               | 0/90 [00:00<?, ?it/s][A
+  2%|██▋                                                                                                                    | 2/90 [00:00<00:25,  3.46it/s][A
+  3%|███▉                                                                                                                   | 3/90 [00:01<00:42,  2.04it/s][A
+  4%|█████▎                                                                                                                 | 4/90 [00:02<00:52,  1.65it/s][A
+  6%|██████▌                                                                                                                | 5/90 [00:03<01:06,  1.29it/s][A
+  7%|███████▉                                                                                                               | 6/90 [00:04<01:04,  1.29it/s][A
+  8%|█████████▎                                                                                                             | 7/90 [00:04<01:07,  1.23it/s][A
+  9%|██████████▌                                                                                                            | 8/90 [00:05<01:06,  1.23it/s][A
+ 10%|███████████▉                                                                                                           | 9/90 [00:06<01:07,  1.20it/s][A
+ 11%|█████████████                                                                                                         | 10/90 [00:07<01:06,  1.21it/s][A
+ 12%|██████████████▍                                                                                                       | 11/90 [00:08<01:07,  1.17it/s][A
+ 13%|███████████████▋                                                                                                      | 12/90 [00:09<01:05,  1.19it/s][A
+ 14%|█████████████████                                                                                                     | 13/90 [00:10<01:06,  1.16it/s][A
+ 16%|██████████████████▎                                                                                                   | 14/90 [00:10<01:04,  1.18it/s][A
+ 17%|███████████████████▋                                                                                                  | 15/90 [00:11<01:05,  1.15it/s][A
+ 18%|████████████████████▉                                                                                                 | 16/90 [00:12<01:02,  1.18it/s][A
+ 19%|██████████████████████▎                                                                                               | 17/90 [00:13<01:03,  1.16it/s][A
+ 20%|███████████████████████▌                                                                                              | 18/90 [00:14<01:01,  1.18it/s][A
+ 21%|████████████████████████▉                                                                                             | 19/90 [00:15<01:01,  1.16it/s][A
+ 22%|██████████████████████████▏                                                                                           | 20/90 [00:16<00:59,  1.18it/s][A
+ 23%|███████████████████████████▌                                                                                          | 21/90 [00:16<00:59,  1.15it/s][A
+ 24%|████████████████████████████▊                                                                                         | 22/90 [00:17<00:57,  1.18it/s][A
+ 26%|██████████████████████████████▏                                                                                       | 23/90 [00:18<00:59,  1.13it/s][A
+ 27%|███████████████████████████████▍                                                                                      | 24/90 [00:19<00:56,  1.16it/s][A
+ 28%|████████████████████████████████▊                                                                                     | 25/90 [00:20<00:57,  1.14it/s][A
+ 29%|██████████████████████████████████                                                                                    | 26/90 [00:21<00:54,  1.16it/s][A
+ 30%|███████████████████████████████████▍                                                                                  | 27/90 [00:22<00:55,  1.14it/s][A
+ 31%|████████████████████████████████████▋                                                                                 | 28/90 [00:22<00:53,  1.17it/s][A
+ 32%|██████████████████████████████████████                                                                                | 29/90 [00:23<00:53,  1.14it/s][A
+ 33%|███████████████████████████████████████▎                                                                              | 30/90 [00:24<00:51,  1.16it/s][A
+ 34%|████████████████████████████████████████▋                                                                             | 31/90 [00:25<00:52,  1.13it/s][A
+ 36%|█████████████████████████████████████████▉                                                                            | 32/90 [00:26<00:50,  1.16it/s][A
+ 37%|███████████████████████████████████████████▎                                                                          | 33/90 [00:27<00:50,  1.13it/s][A
+ 38%|████████████████████████████████████████████▌                                                                         | 34/90 [00:28<00:48,  1.16it/s][A
+ 39%|█████████████████████████████████████████████▉                                                                        | 35/90 [00:29<00:48,  1.14it/s][A
+ 40%|███████████████████████████████████████████████▏                                                                      | 36/90 [00:29<00:46,  1.17it/s][A
+ 41%|████████████████████████████████████████████████▌                                                                     | 37/90 [00:30<00:46,  1.14it/s][A
+ 42%|█████████████████████████████████████████████████▊                                                                    | 38/90 [00:31<00:44,  1.17it/s][A
+ 43%|███████████████████████████████████████████████████▏                                                                  | 39/90 [00:32<00:44,  1.14it/s][A
+ 44%|████████████████████████████████████████████████████▍                                                                 | 40/90 [00:33<00:42,  1.17it/s][A
+ 46%|█████████████████████████████████████████████████████▊                                                                | 41/90 [00:34<00:42,  1.15it/s][A
+ 47%|███████████████████████████████████████████████████████                                                               | 42/90 [00:35<00:40,  1.17it/s][A
+ 48%|████████████████████████████████████████████████████████▍                                                             | 43/90 [00:36<00:40,  1.15it/s][A
+ 49%|█████████████████████████████████████████████████████████▋                                                            | 44/90 [00:36<00:39,  1.17it/s][A
+ 50%|███████████████████████████████████████████████████████████                                                           | 45/90 [00:37<00:39,  1.15it/s][A
+ 51%|████████████████████████████████████████████████████████████▎                                                         | 46/90 [00:38<00:37,  1.17it/s][A
+ 52%|█████████████████████████████████████████████████████████████▌                                                        | 47/90 [00:39<00:37,  1.14it/s][A
+ 53%|██████████████████████████████████████████████████████████████▉                                                       | 48/90 [00:40<00:36,  1.17it/s][A
+ 54%|████████████████████████████████████████████████████████████████▏                                                     | 49/90 [00:41<00:35,  1.15it/s][A
+ 56%|█████████████████████████████████████████████████████████████████▌                                                    | 50/90 [00:42<00:34,  1.17it/s][A
+ 57%|██████████████████████████████████████████████████████████████████▊                                                   | 51/90 [00:42<00:33,  1.15it/s][A
+ 58%|████████████████████████████████████████████████████████████████████▏                                                 | 52/90 [00:43<00:32,  1.17it/s][A
+ 59%|█████████████████████████████████████████████████████████████████████▍                                                | 53/90 [00:44<00:32,  1.15it/s][A
+ 60%|██████████████████████████████████████████████████████████████████████▊                                               | 54/90 [00:45<00:30,  1.17it/s][A
+ 61%|████████████████████████████████████████████████████████████████████████                                              | 55/90 [00:46<00:30,  1.14it/s][A
+ 62%|█████████████████████████████████████████████████████████████████████████▍                                            | 56/90 [00:47<00:29,  1.16it/s][A
+ 63%|██████████████████████████████████████████████████████████████████████████▋                                           | 57/90 [00:48<00:28,  1.14it/s][A
+ 64%|████████████████████████████████████████████████████████████████████████████                                          | 58/90 [00:48<00:27,  1.16it/s][A
+ 66%|█████████████████████████████████████████████████████████████████████████████▎                                        | 59/90 [00:49<00:27,  1.14it/s][A
+ 67%|██████████████████████████████████████████████████████████████████████████████▋                                       | 60/90 [00:50<00:25,  1.16it/s][A
+ 68%|█████████████████████████████████████████████��█████████████████████████████████▉                                      | 61/90 [00:51<00:25,  1.14it/s][A
+ 69%|█████████████████████████████████████████████████████████████████████████████████▎                                    | 62/90 [00:52<00:24,  1.16it/s][A
+ 70%|██████████████████████████████████████████████████████████████████████████████████▌                                   | 63/90 [00:53<00:23,  1.13it/s][A
+ 71%|███████████████████████████████████████████████████████████████████████████████████▉                                  | 64/90 [00:54<00:22,  1.15it/s][A
+ 72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 65/90 [00:55<00:21,  1.14it/s][A
+ 73%|██████████████████████████████████████████████████████████████████████████████████████▌                               | 66/90 [00:55<00:20,  1.16it/s][A
+ 74%|███████████████████████████████████████████████████████████████████████████████████████▊                              | 67/90 [00:56<00:20,  1.14it/s][A
+ 76%|█████████████████████████████████████████████████████████████████████████████████████████▏                            | 68/90 [00:57<00:18,  1.17it/s][A
+ 77%|██████████████████████████████████████████████████████████████████████████████████████████▍                           | 69/90 [00:58<00:19,  1.06it/s][A
+ 78%|███████████████████████████████████████████████████████████████████████████████████████████▊                          | 70/90 [00:59<00:18,  1.11it/s][A
+ 79%|█████████████████████████████████████████████████████████████████████████████████████████████                         | 71/90 [01:00<00:17,  1.11it/s][A
+ 80%|██████████████████████████████████████████████████████████████████████████████████████████████▍                       | 72/90 [01:01<00:15,  1.14it/s][A
+ 81%|███████████████████████████████████████████████████████████████████████████████████████████████▋                      | 73/90 [01:02<00:14,  1.13it/s][A
+ 82%|█████████████████████████████████████████████████████████████████████████████████████████████████                     | 74/90 [01:03<00:13,  1.16it/s][A
+ 83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 75/90 [01:03<00:13,  1.13it/s][A
+ 84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 76/90 [01:04<00:12,  1.16it/s][A
+ 86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 77/90 [01:05<00:11,  1.13it/s][A
+ 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 78/90 [01:06<00:10,  1.16it/s][A
+ 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 79/90 [01:07<00:09,  1.13it/s][A
+ 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 80/90 [01:08<00:08,  1.16it/s][A
+ 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 81/90 [01:09<00:07,  1.14it/s][A
+ 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 82/90 [01:09<00:06,  1.16it/s][A
+ 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 83/90 [01:10<00:06,  1.14it/s][A
+ 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 84/90 [01:11<00:05,  1.16it/s][A
+ 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 85/90 [01:12<00:04,  1.13it/s][A
+ 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 86/90 [01:13<00:03,  1.16it/s][A
+ 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 87/90 [01:14<00:02,  1.15it/s][A
+ 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 88/90 [01:15<00:01,  1.17it/s][A
+ 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00,  1.15it/s][A
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:16<00:00,  1.16it/s][A                                                                                                                                                           
+                                                                                                                                                           [A{'eval_loss': 1.6886017322540283, 'eval_runtime': 79.9199, 'eval_samples_per_second': 9.134, 'eval_steps_per_second': 2.29, 'eval_ppl': 5.4119, 'memory/max_active (GiB)': 11.16, 'memory/max_allocated (GiB)': 5.19, 'memory/device_reserved (GiB)': 13.81, 'epoch': 0}
+  0%|                                                                                                                             | 0/2499 [01:23<?, ?it/s]
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00,  1.16it/s][A
+                                                                                                                                                           [A  0%|                                                                                                                  | 1/2499 [01:32<64:06:25, 92.39s/it]                                                                                                                                                           {'loss': 2.0336, 'grad_norm': 1.6855894327163696, 'learning_rate': 0.0, 'ppl': 7.6415, 'memory/max_active (GiB)': 16.07, 'memory/max_allocated (GiB)': 10.54, 'memory/device_reserved (GiB)': 18.02, 'tokens_per_second_per_gpu': 197269.0, 'total_tokens': 1298183, 'epoch': 0.0}
+  0%|                                                                                                                  | 1/2499 [01:32<64:06:25, 92.39s/it]  0%|                                                                                                                  | 2/2499 [01:38<28:57:02, 41.74s/it]                                                                                                                                                           {'loss': 1.7737, 'grad_norm': 1.1572575569152832, 'learning_rate': 2e-05, 'ppl': 5.8926, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.97, 'tokens_per_second_per_gpu': 4746.09, 'total_tokens': 1327999, 'epoch': 0.0}
+  0%|                                                                                                                  | 2/2499 [01:38<28:57:02, 41.74s/it]  0%|▏                                                                                                                 | 3/2499 [01:44<17:42:14, 25.53s/it]                                                                                                                                                           {'loss': 1.85, 'grad_norm': 1.594330072402954, 'learning_rate': 4e-05, 'ppl': 6.3598, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.99, 'tokens_per_second_per_gpu': 4622.21, 'total_tokens': 1356883, 'epoch': 0.0}
+  0%|▏                                                                                                                 | 3/2499 [01:44<17:42:14, 25.53s/it]  0%|▏                                                                                                                 | 4/2499 [01:51<12:25:28, 17.93s/it]                                                                                                                                                           {'loss': 1.6567, 'grad_norm': 1.557888150215149, 'learning_rate': 6e-05, 'ppl': 5.242, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.99, 'tokens_per_second_per_gpu': 4690.94, 'total_tokens': 1386261, 'epoch': 0.0}
+  0%|▏                                                                                                                 | 4/2499 [01:51<12:25:28, 17.93s/it]  0%|▏                                                                                                                  | 5/2499 [01:57<9:30:55, 13.74s/it]                                                                                                                                                           {'loss': 1.9046, 'grad_norm': 1.6567342281341553, 'learning_rate': 8e-05, 'ppl': 6.7167, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4507.97, 'total_tokens': 1414659, 'epoch': 0.01}
+  0%|▏                                                                                                                  | 5/2499 [01:57<9:30:55, 13.74s/it]  0%|▎                                                                                                                  | 6/2499 [02:03<7:45:45, 11.21s/it]                                                                                                                                                           {'loss': 1.8432, 'grad_norm': 1.6043676137924194, 'learning_rate': 0.0001, 'ppl': 6.3167, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4597.87, 'total_tokens': 1443641, 'epoch': 0.01}
+  0%|▎                                                                                                                  | 6/2499 [02:03<7:45:45, 11.21s/it]  0%|▎                                                                                                                  | 7/2499 [02:10<6:38:32,  9.60s/it]                                                                                                                                                           {'loss': 1.492, 'grad_norm': 1.2741687297821045, 'learning_rate': 0.00012, 'ppl': 4.446, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4542.53, 'total_tokens': 1472125, 'epoch': 0.01}
+  0%|▎                                                                                                                  | 7/2499 [02:10<6:38:32,  9.60s/it]  0%|▎                                                                                                                  | 8/2499 [02:16<5:54:25,  8.54s/it]                                                                                                                                                           {'loss': 1.4809, 'grad_norm': 1.3272074460983276, 'learning_rate': 0.00014, 'ppl': 4.3969, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4670.47, 'total_tokens': 1501396, 'epoch': 0.01}
+  0%|▎                                                                                                                  | 8/2499 [02:16<5:54:25,  8.54s/it]  0%|▍                                                                                                                  | 9/2499 [02:22<5:24:48,  7.83s/it]                                                                                                                                                           {'loss': 1.238, 'grad_norm': 1.0670270919799805, 'learning_rate': 0.00016, 'ppl': 3.4487, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4547.02, 'total_tokens': 1529874, 'epoch': 0.01}
+  0%|▍                                                                                                                  | 9/2499 [02:22<5:24:48,  7.83s/it]  0%|▍                                                                                                                 | 10/2499 [02:28<5:04:46,  7.35s/it]                                                                                                                                                           {'loss': 1.2017, 'grad_norm': 0.9426001906394958, 'learning_rate': 0.00018, 'ppl': 3.3258, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4686.7, 'total_tokens': 1559258, 'epoch': 0.01}
+  0%|▍                                                                                                                 | 10/2499 [02:28<5:04:46,  7.35s/it]  0%|▌                                                                                                                 | 11/2499 [02:35<4:51:01,  7.02s/it]                                                                                                                                                           {'loss': 1.1605, 'grad_norm': 0.8342238664627075, 'learning_rate': 0.0002, 'ppl': 3.1915, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4637.04, 'total_tokens': 1588337, 'epoch': 0.01}
+  0%|▌                                                                                                                 | 11/2499 [02:35<4:51:01,  7.02s/it]  0%|▌                                                                                                                 | 12/2499 [02:41<4:42:06,  6.81s/it]                                                                                                                                                           {'loss': 1.2037, 'grad_norm': 0.9213444590568542, 'learning_rate': 0.00019999992034374237, 'ppl': 3.3324, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4643.51, 'total_tokens': 1617675, 'epoch': 0.01}
+  0%|▌                                                                                                                 | 12/2499 [02:41<4:42:06,  6.81s/it]  1%|▌                                                                                                                 | 13/2499 [02:47<4:35:33,  6.65s/it]                                                                                                                                                           {'loss': 1.0463, 'grad_norm': 0.5648354887962341, 'learning_rate': 0.0001999996813750963, 'ppl': 2.8471, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4691.68, 'total_tokens': 1647182, 'epoch': 0.02}
+  1%|▌                                                                                                                 | 13/2499 [02:47<4:35:33,  6.65s/it]  1%|▋                                                                                                                 | 14/2499 [02:54<4:30:45,  6.54s/it]                                                                                                                                                           {'loss': 1.0009, 'grad_norm': 0.4093482494354248, 'learning_rate': 0.0001999992830944426, 'ppl': 2.7207, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4583.72, 'total_tokens': 1675932, 'epoch': 0.02}
+  1%|▋                                                                                                                 | 14/2499 [02:54<4:30:45,  6.54s/it]  1%|▋                                                                                                                 | 15/2499 [03:00<4:27:25,  6.46s/it]                                                                                                                                                           {'loss': 1.0439, 'grad_norm': 0.6911133527755737, 'learning_rate': 0.0001999987255024157, 'ppl': 2.8403, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4700.02, 'total_tokens': 1705435, 'epoch': 0.02}
+  1%|▋                                                                                                                 | 15/2499 [03:00<4:27:25,  6.46s/it]  1%|▋                                                                                                                 | 16/2499 [03:06<4:24:59,  6.40s/it]                                                                                                                                                           {'loss': 1.0052, 'grad_norm': 0.647537112236023, 'learning_rate': 0.0001999980085999039, 'ppl': 2.7325, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4640.77, 'total_tokens': 1734534, 'epoch': 0.02}
+  1%|▋                                                                                                                 | 16/2499 [03:06<4:24:59,  6.40s/it]  1%|▊                                                                                                                 | 17/2499 [03:12<4:23:03,  6.36s/it]                                                                                                                                                           {'loss': 0.8606, 'grad_norm': 0.24260607361793518, 'learning_rate': 0.0001999971323880494, 'ppl': 2.3646, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4574.84, 'total_tokens': 1763148, 'epoch': 0.02}
+  1%|▊                                                                                                                 | 17/2499 [03:12<4:23:03,  6.36s/it]  1%|▊                                                                                                                 | 18/2499 [03:19<4:21:50,  6.33s/it]                                                                                                                                                           {'loss': 0.9237, 'grad_norm': 0.34218189120292664, 'learning_rate': 0.00019999609686824802, 'ppl': 2.5186, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4659.05, 'total_tokens': 1792345, 'epoch': 0.02}
+  1%|▊                                                                                                                 | 18/2499 [03:19<4:21:50,  6.33s/it]  1%|▊                                                                                                                 | 19/2499 [03:25<4:21:38,  6.33s/it]                                                                                                                                                           {'loss': 0.8695, 'grad_norm': 0.6931776404380798, 'learning_rate': 0.00019999490204214958, 'ppl': 2.3857, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4737.23, 'total_tokens': 1822296, 'epoch': 0.02}
+  1%|▊                                                                                                                 | 19/2499 [03:25<4:21:38,  6.33s/it]  1%|▉                                                                                                                 | 20/2499 [03:31<4:21:01,  6.32s/it]                                                                                                                                                           {'loss': 0.8121, 'grad_norm': 0.29975464940071106, 'learning_rate': 0.00019999354791165749, 'ppl': 2.2526, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4681.06, 'total_tokens': 1851717, 'epoch': 0.02}
+  1%|▉                                                                                                                 | 20/2499 [03:31<4:21:01,  6.32s/it]  1%|▉                                                                                                                 | 21/2499 [03:38<4:20:42,  6.31s/it]                                                                                                                                                           {'loss': 0.8624, 'grad_norm': 0.25352585315704346, 'learning_rate': 0.0001999920344789291, 'ppl': 2.3688, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4727.48, 'total_tokens': 1881491, 'epoch': 0.03}
+  1%|▉                                                                                                                 | 21/2499 [03:38<4:20:42,  6.31s/it]  1%|█                                                                                                                 | 22/2499 [03:44<4:20:30,  6.31s/it]                                                                                                                                                           {'loss': 0.8812, 'grad_norm': 0.3950115144252777, 'learning_rate': 0.00019999036174637546, 'ppl': 2.4138, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4758.76, 'total_tokens': 1911483, 'epoch': 0.03}
+  1%|█                                                                                                                 | 22/2499 [03:44<4:20:30,  6.31s/it]  1%|█                                                                                                                 | 23/2499 [03:50<4:19:54,  6.30s/it]                                                                                                                                                           {'loss': 0.819, 'grad_norm': 0.24307860434055328, 'learning_rate': 0.0001999885297166615, 'ppl': 2.2682, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4582.96, 'total_tokens': 1940207, 'epoch': 0.03}
+  1%|█                                                                                                                 | 23/2499 [03:50<4:19:54,  6.30s/it]  1%|█                                                                                                                 | 24/2499 [03:56<4:19:14,  6.28s/it]                                                                                                                                                           {'loss': 0.7642, 'grad_norm': 0.17830020189285278, 'learning_rate': 0.00019998653839270583, 'ppl': 2.1473, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4526.68, 'total_tokens': 1968501, 'epoch': 0.03}
+  1%|█                                                                                                                 | 24/2499 [03:56<4:19:14,  6.28s/it]  1%|█▏                                                                                                                | 25/2499 [04:03<4:19:12,  6.29s/it]                                                                                                                                                           {'loss': 0.7952, 'grad_norm': 0.1788649559020996, 'learning_rate': 0.0001999843877776809, 'ppl': 2.2149, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4562.49, 'total_tokens': 1997194, 'epoch': 0.03}
+  1%|█▏                                                                                                                | 25/2499 [04:03<4:19:12,  6.29s/it]  1%|█▏                                                                                                                | 26/2499 [04:09<4:19:12,  6.29s/it]                                                                                                                                                           {'loss': 0.8073, 'grad_norm': 0.24912691116333008, 'learning_rate': 0.00019998207787501286, 'ppl': 2.2418, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4474.8, 'total_tokens': 2025344, 'epoch': 0.03}
+  1%|█▏                                                                                                                | 26/2499 [04:09<4:19:12,  6.29s/it]  1%|█▏                                                                                                                | 27/2499 [04:15<4:18:55,  6.28s/it]                                                                                                                                                           {'loss': 0.7831, 'grad_norm': 0.21209484338760376, 'learning_rate': 0.00019997960868838174, 'ppl': 2.1882, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4661.31, 'total_tokens': 2054571, 'epoch': 0.03}
+  1%|█▏                                                                                                                | 27/2499 [04:15<4:18:55,  6.28s/it]  1%|█▎                                                                                                                | 28/2499 [04:21<4:18:38,  6.28s/it]                                                                                                                                                           {'loss': 0.7746, 'grad_norm': 0.216914564371109, 'learning_rate': 0.0001999769802217212, 'ppl': 2.1697, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4733.21, 'total_tokens': 2084241, 'epoch': 0.03}
+  1%|█▎                                                                                                                | 28/2499 [04:21<4:18:38,  6.28s/it]  1%|█▎                                                                                                                | 29/2499 [04:28<4:18:15,  6.27s/it]                                                                                                                                                           {'loss': 0.8475, 'grad_norm': 0.207558274269104, 'learning_rate': 0.0001999741924792188, 'ppl': 2.3338, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4546.06, 'total_tokens': 2112679, 'epoch': 0.03}
+  1%|█▎                                                                                                                | 29/2499 [04:28<4:18:15,  6.27s/it]  1%|█▎                                                                                                                | 30/2499 [04:34<4:17:50,  6.27s/it]                                                                                                                                                           {'loss': 0.7692, 'grad_norm': 0.21438081562519073, 'learning_rate': 0.0001999712454653157, 'ppl': 2.158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4561.33, 'total_tokens': 2141169, 'epoch': 0.04}
+  1%|█▎                                                                                                                | 30/2499 [04:34<4:17:50,  6.27s/it]  1%|█▍                                                                                                                | 31/2499 [04:40<4:17:31,  6.26s/it]                                                                                                                                                           {'loss': 0.7869, 'grad_norm': 0.16961662471294403, 'learning_rate': 0.00019996813918470686, 'ppl': 2.1966, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4358.55, 'total_tokens': 2168390, 'epoch': 0.04}
+  1%|█▍                                                                                                                | 31/2499 [04:40<4:17:31,  6.26s/it]  1%|█▍                                                                                                                | 32/2499 [04:47<4:17:50,  6.27s/it]                                                                                                                                                           {'loss': 0.8634, 'grad_norm': 0.18904076516628265, 'learning_rate': 0.000199964873642341, 'ppl': 2.3712, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4506.81, 'total_tokens': 2196748, 'epoch': 0.04}
+  1%|█▍                                                                                                                | 32/2499 [04:47<4:17:50,  6.27s/it]  1%|█▌                                                                                                                | 33/2499 [04:53<4:18:20,  6.29s/it]                                                                                                                                                           {'loss': 0.7711, 'grad_norm': 0.16406087577342987, 'learning_rate': 0.0001999614488434205, 'ppl': 2.1621, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4605.35, 'total_tokens': 2225839, 'epoch': 0.04}
+  1%|█▌                                                                                                                | 33/2499 [04:53<4:18:20,  6.29s/it]  1%|█▌                                                                                                                | 34/2499 [04:59<4:18:37,  6.30s/it]                                                                                                                                                           {'loss': 0.7774, 'grad_norm': 0.15022194385528564, 'learning_rate': 0.00019995786479340156, 'ppl': 2.1758, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4747.45, 'total_tokens': 2255812, 'epoch': 0.04}
+  1%|█▌                                                                                                                | 34/2499 [04:59<4:18:37,  6.30s/it]  1%|█▌                                                                                                                | 35/2499 [05:05<4:18:16,  6.29s/it]                                                                                                                                                           {'loss': 0.6847, 'grad_norm': 0.11543940007686615, 'learning_rate': 0.00019995412149799395, 'ppl': 1.9832, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4510.07, 'total_tokens': 2284100, 'epoch': 0.04}
+  1%|█▌                                                                                                                | 35/2499 [05:05<4:18:16,  6.29s/it]  1%|█▋                                                                                                                | 36/2499 [05:12<4:17:51,  6.28s/it]                                                                                                                                                           {'loss': 0.7787, 'grad_norm': 0.1670907884836197, 'learning_rate': 0.00019995021896316128, 'ppl': 2.1786, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4492.67, 'total_tokens': 2312230, 'epoch': 0.04}
+  1%|█▋                                                                                                                | 36/2499 [05:12<4:17:51,  6.28s/it]  1%|█▋                                                                                                                | 37/2499 [05:18<4:17:29,  6.28s/it]                                                                                                                                                           {'loss': 0.7615, 'grad_norm': 0.19045475125312805, 'learning_rate': 0.00019994615719512072, 'ppl': 2.1415, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4537.58, 'total_tokens': 2340625, 'epoch': 0.04}
+  1%|█▋                                                                                                                | 37/2499 [05:18<4:17:29,  6.28s/it]  2%|█▋                                                                                                                | 38/2499 [05:24<4:17:12,  6.27s/it]                                                                                                                                                           {'loss': 0.841, 'grad_norm': 0.13640637695789337, 'learning_rate': 0.00019994193620034314, 'ppl': 2.3187, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4534.95, 'total_tokens': 2369006, 'epoch': 0.05}
+  2%|█▋                                                                                                                | 38/2499 [05:24<4:17:12,  6.27s/it]  2%|█▊                                                                                                                | 39/2499 [05:31<4:17:41,  6.28s/it]                                                                                                                                                           {'loss': 0.8279, 'grad_norm': 0.15840484201908112, 'learning_rate': 0.00019993755598555322, 'ppl': 2.2885, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4723.1, 'total_tokens': 2398832, 'epoch': 0.05}
+  2%|█▊                                                                                                                | 39/2499 [05:31<4:17:41,  6.28s/it]  2%|█▊                                                                                                                | 40/2499 [05:37<4:17:47,  6.29s/it]                                                                                                                                                           {'loss': 0.6928, 'grad_norm': 0.13987034559249878, 'learning_rate': 0.0001999330165577291, 'ppl': 1.9993, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4510.37, 'total_tokens': 2427243, 'epoch': 0.05}
+  2%|█▊                                                                                                                | 40/2499 [05:37<4:17:47,  6.29s/it]  2%|█▊                                                                                                                | 41/2499 [05:43<4:17:35,  6.29s/it]                                                                                                                                                           {'loss': 0.7248, 'grad_norm': 0.13921092450618744, 'learning_rate': 0.00019992831792410272, 'ppl': 2.0643, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4685.94, 'total_tokens': 2456661, 'epoch': 0.05}
+  2%|█▊                                                                                                                | 41/2499 [05:43<4:17:35,  6.29s/it]  2%|█▉                                                                                                                | 42/2499 [05:49<4:17:18,  6.28s/it]                                                                                                                                                           {'loss': 0.7406, 'grad_norm': 0.12492494285106659, 'learning_rate': 0.0001999234600921595, 'ppl': 2.0972, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4612.0, 'total_tokens': 2485581, 'epoch': 0.05}
+  2%|█▉                                                                                                                | 42/2499 [05:49<4:17:18,  6.28s/it]  2%|█▉                                                                                                                | 43/2499 [05:56<4:17:02,  6.28s/it]                                                                                                                                                           {'loss': 0.7535, 'grad_norm': 0.12467890232801437, 'learning_rate': 0.00019991844306963872, 'ppl': 2.1244, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4568.82, 'total_tokens': 2514221, 'epoch': 0.05}
+  2%|█▉                                                                                                                | 43/2499 [05:56<4:17:02,  6.28s/it]  2%|██                                                                                                                | 44/2499 [06:02<4:16:41,  6.27s/it]                                                                                                                                                           {'loss': 0.7356, 'grad_norm': 0.1306881159543991, 'learning_rate': 0.000199913266864533, 'ppl': 2.0867, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4386.29, 'total_tokens': 2541665, 'epoch': 0.05}
+  2%|██                                                                                                                | 44/2499 [06:02<4:16:41,  6.27s/it]  2%|██                                                                                                                | 45/2499 [06:08<4:16:23,  6.27s/it]                                                                                                                                                           {'loss': 0.7163, 'grad_norm': 0.1349906027317047, 'learning_rate': 0.0001999079314850887, 'ppl': 2.0468, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4404.7, 'total_tokens': 2569218, 'epoch': 0.05}
+  2%|██                                                                                                                | 45/2499 [06:08<4:16:23,  6.27s/it]  2%|██                                                                                                                | 46/2499 [06:15<4:16:55,  6.28s/it]                                                                                                                                                           {'loss': 0.6931, 'grad_norm': 0.14203360676765442, 'learning_rate': 0.0001999024369398058, 'ppl': 1.9999, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4625.96, 'total_tokens': 2598443, 'epoch': 0.06}
+  2%|██                                                                                                                | 46/2499 [06:15<4:16:55,  6.28s/it]  2%|██▏                                                                                                               | 47/2499 [06:21<4:17:25,  6.30s/it]                                                                                                                                                           {'loss': 0.7034, 'grad_norm': 0.1235819086432457, 'learning_rate': 0.00019989678323743774, 'ppl': 2.0206, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4575.29, 'total_tokens': 2627402, 'epoch': 0.06}
+  2%|██▏                                                                                                               | 47/2499 [06:21<4:17:25,  6.30s/it]  2%|██▏                                                                                                               | 48/2499 [06:27<4:17:16,  6.30s/it]                                                                                                                                                           {'loss': 0.7176, 'grad_norm': 0.14084498584270477, 'learning_rate': 0.00019989097038699164, 'ppl': 2.0495, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4732.23, 'total_tokens': 2657177, 'epoch': 0.06}
+  2%|██▏                                                                                                               | 48/2499 [06:27<4:17:16,  6.30s/it]  2%|██▏                                                                                                               | 49/2499 [06:33<4:17:13,  6.30s/it]                                                                                                                                                           {'loss': 0.7038, 'grad_norm': 0.12469019740819931, 'learning_rate': 0.00019988499839772804, 'ppl': 2.0214, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4607.81, 'total_tokens': 2686207, 'epoch': 0.06}
+  2%|██▏                                                                                                               | 49/2499 [06:33<4:17:13,  6.30s/it]  2%|██▎                                                                                                               | 50/2499 [06:40<4:16:45,  6.29s/it]                                                                                                                                                           {'loss': 0.6652, 'grad_norm': 0.12172164767980576, 'learning_rate': 0.0001998788672791611, 'ppl': 1.9449, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4587.16, 'total_tokens': 2714952, 'epoch': 0.06}
+  2%|██▎                                                                                                               | 50/2499 [06:40<4:16:45,  6.29s/it]  2%|██▎                                                                                                               | 51/2499 [06:46<4:16:32,  6.29s/it]                                                                                                                                                           {'loss': 0.7439, 'grad_norm': 0.12937241792678833, 'learning_rate': 0.00019987257704105844, 'ppl': 2.1041, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4660.7, 'total_tokens': 2744218, 'epoch': 0.06}
+  2%|██▎                                                                                                               | 51/2499 [06:46<4:16:32,  6.29s/it]  2%|██▎                                                                                                               | 52/2499 [06:52<4:16:41,  6.29s/it]                                                                                                                                                           {'loss': 0.7108, 'grad_norm': 0.1375284045934677, 'learning_rate': 0.0001998661276934412, 'ppl': 2.0356, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4450.95, 'total_tokens': 2772286, 'epoch': 0.06}
+  2%|██▎                                                                                                               | 52/2499 [06:52<4:16:41,  6.29s/it]  2%|██▍                                                                                                               | 53/2499 [06:59<4:16:42,  6.30s/it]                                                                                                                                                           {'loss': 0.7404, 'grad_norm': 0.12681901454925537, 'learning_rate': 0.000199859519246584, 'ppl': 2.0968, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4493.7, 'total_tokens': 2800601, 'epoch': 0.06}
+  2%|██▍                                                                                                               | 53/2499 [06:59<4:16:42,  6.30s/it]  2%|██▍                                                                                                               | 54/2499 [07:05<4:16:46,  6.30s/it]                                                                                                                                                           {'loss': 0.7666, 'grad_norm': 0.1492014229297638, 'learning_rate': 0.00019985275171101495, 'ppl': 2.1524, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4497.77, 'total_tokens': 2828969, 'epoch': 0.06}
+  2%|██▍                                                                                                               | 54/2499 [07:05<4:16:46,  6.30s/it]  2%|██▌                                                                                                               | 55/2499 [07:11<4:16:50,  6.31s/it]                                                                                                                                                           {'loss': 0.723, 'grad_norm': 0.12260715663433075, 'learning_rate': 0.00019984582509751552, 'ppl': 2.0606, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4470.31, 'total_tokens': 2857188, 'epoch': 0.07}
+  2%|██▌                                                                                                               | 55/2499 [07:11<4:16:50,  6.31s/it]  2%|██▌                                                                                                               | 56/2499 [07:18<4:16:52,  6.31s/it]                                                                                                                                                           {'loss': 0.7153, 'grad_norm': 0.1511772871017456, 'learning_rate': 0.00019983873941712072, 'ppl': 2.0448, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4408.94, 'total_tokens': 2885021, 'epoch': 0.07}
+  2%|██▌                                                                                                               | 56/2499 [07:18<4:16:52,  6.31s/it]  2%|██▌                                                                                                               | 57/2499 [07:24<4:16:36,  6.31s/it]                                                                                                                                                           {'loss': 0.6686, 'grad_norm': 0.12503519654273987, 'learning_rate': 0.00019983149468111894, 'ppl': 1.9515, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4573.97, 'total_tokens': 2913799, 'epoch': 0.07}
+  2%|██▌                                                                                                               | 57/2499 [07:24<4:16:36,  6.31s/it]  2%|██▋                                                                                                               | 58/2499 [07:30<4:16:27,  6.30s/it]                                                                                                                                                           {'loss': 0.7522, 'grad_norm': 0.12792782485485077, 'learning_rate': 0.0001998240909010519, 'ppl': 2.1217, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4703.97, 'total_tokens': 2943426, 'epoch': 0.07}
+  2%|██▋                                                                                                               | 58/2499 [07:30<4:16:27,  6.30s/it]  2%|██▋                                                                                                               | 59/2499 [07:36<4:16:22,  6.30s/it]                                                                                                                                                           {'loss': 0.6882, 'grad_norm': 0.12937703728675842, 'learning_rate': 0.00019981652808871475, 'ppl': 1.9901, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4699.04, 'total_tokens': 2973043, 'epoch': 0.07}
+  2%|██▋                                                                                                               | 59/2499 [07:36<4:16:22,  6.30s/it]  2%|██▋                                                                                                               | 60/2499 [07:43<4:16:07,  6.30s/it]                                                                                                                                                           {'loss': 0.6602, 'grad_norm': 0.12878933548927307, 'learning_rate': 0.00019980880625615604, 'ppl': 1.9352, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4500.91, 'total_tokens': 3001352, 'epoch': 0.07}
+  2%|██▋                                                                                                               | 60/2499 [07:43<4:16:07,  6.30s/it]  2%|██▊                                                                                                               | 61/2499 [07:49<4:15:36,  6.29s/it]                                                                                                                                                           {'loss': 0.6685, 'grad_norm': 0.13316965103149414, 'learning_rate': 0.00019980092541567763, 'ppl': 1.9513, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4517.16, 'total_tokens': 3029652, 'epoch': 0.07}
+  2%|██▊                                                                                                               | 61/2499 [07:49<4:15:36,  6.29s/it]  2%|██▊                                                                                                               | 62/2499 [07:55<4:15:45,  6.30s/it]                                                                                                                                                           {'loss': 0.7736, 'grad_norm': 0.12971599400043488, 'learning_rate': 0.0001997928855798346, 'ppl': 2.1676, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4444.53, 'total_tokens': 3057692, 'epoch': 0.07}
+  2%|██▊                                                                                                               | 62/2499 [07:55<4:15:45,  6.30s/it]  3%|██▊                                                                                                               | 63/2499 [08:02<4:15:43,  6.30s/it]                                                                                                                                                           {'loss': 0.6215, 'grad_norm': 0.11753156036138535, 'learning_rate': 0.0001997846867614355, 'ppl': 1.8617, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4651.6, 'total_tokens': 3086990, 'epoch': 0.08}
+  3%|██▊                                                                                                               | 63/2499 [08:02<4:15:43,  6.30s/it]  3%|██▉                                                                                                               | 64/2499 [08:08<4:15:42,  6.30s/it]                                                                                                                                                           {'loss': 0.6703, 'grad_norm': 0.14658862352371216, 'learning_rate': 0.00019977632897354202, 'ppl': 1.9548, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4746.31, 'total_tokens': 3116909, 'epoch': 0.08}
+  3%|██▉                                                                                                               | 64/2499 [08:08<4:15:42,  6.30s/it]  3%|██▉                                                                                                               | 65/2499 [08:14<4:15:28,  6.30s/it]                                                                                                                                                           {'loss': 0.6798, 'grad_norm': 0.12969624996185303, 'learning_rate': 0.00019976781222946918, 'ppl': 1.9735, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4652.71, 'total_tokens': 3146161, 'epoch': 0.08}
+  3%|██▉                                                                                                               | 65/2499 [08:14<4:15:28,  6.30s/it]  3%|███                                                                                                               | 66/2499 [08:21<4:15:23,  6.30s/it]                                                                                                                                                           {'loss': 0.6765, 'grad_norm': 0.20642466843128204, 'learning_rate': 0.00019975913654278525, 'ppl': 1.967, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4720.42, 'total_tokens': 3175889, 'epoch': 0.08}
+  3%|███                                                                                                               | 66/2499 [08:21<4:15:23,  6.30s/it]  3%|███                                                                                                               | 67/2499 [08:27<4:14:54,  6.29s/it]                                                                                                                                                           {'loss': 0.6657, 'grad_norm': 0.12067057937383652, 'learning_rate': 0.0001997503019273116, 'ppl': 1.9459, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4506.21, 'total_tokens': 3204118, 'epoch': 0.08}
+  3%|███                                                                                                               | 67/2499 [08:27<4:14:54,  6.29s/it]  3%|███                                                                                                               | 68/2499 [08:33<4:14:44,  6.29s/it]                                                                                                                                                           {'loss': 0.6175, 'grad_norm': 0.12278411537408829, 'learning_rate': 0.000199741308397123, 'ppl': 1.8543, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4567.1, 'total_tokens': 3232803, 'epoch': 0.08}
+  3%|███                                                                                                               | 68/2499 [08:33<4:14:44,  6.29s/it]  3%|███▏                                                                                                              | 69/2499 [08:39<4:15:04,  6.30s/it]                                                                                                                                                           {'loss': 0.6619, 'grad_norm': 0.13150422275066376, 'learning_rate': 0.00019973215596654715, 'ppl': 1.9385, 'memory/max_active (GiB)': 17.82, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4495.32, 'total_tokens': 3261216, 'epoch': 0.08}
+  3%|███▏                                                                                                              | 69/2499 [08:39<4:15:04,  6.30s/it]  3%|███▏                                                                                                              | 70/2499 [08:46<4:14:55,  6.30s/it]                                                                                                                                                           {'loss': 0.7218, 'grad_norm': 0.1392705738544464, 'learning_rate': 0.0001997228446501651, 'ppl': 2.0581, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4586.02, 'total_tokens': 3290070, 'epoch': 0.08}
+  3%|███▏                                                                                                              | 70/2499 [08:46<4:14:55,  6.30s/it]  3%|███▏                                                                                                              | 71/2499 [08:52<4:14:32,  6.29s/it]                                                                                                                                                           {'loss': 0.7086, 'grad_norm': 0.15434479713439941, 'learning_rate': 0.00019971337446281087, 'ppl': 2.0311, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4580.13, 'total_tokens': 3318793, 'epoch': 0.09}
+  3%|███▏                                                                                                              | 71/2499 [08:52<4:14:32,  6.29s/it]  3%|███▎                                                                                                              | 72/2499 [08:58<4:14:10,  6.28s/it]                                                                                                                                                           {'loss': 0.7222, 'grad_norm': 0.1450231820344925, 'learning_rate': 0.00019970374541957174, 'ppl': 2.059, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4561.96, 'total_tokens': 3347382, 'epoch': 0.09}
+  3%|███▎                                                                                                              | 72/2499 [08:58<4:14:10,  6.28s/it]  3%|███▎                                                                                                              | 73/2499 [09:05<4:14:10,  6.29s/it]                                                                                                                                                           {'loss': 0.6646, 'grad_norm': 0.14817385375499725, 'learning_rate': 0.00019969395753578794, 'ppl': 1.9437, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4675.51, 'total_tokens': 3376788, 'epoch': 0.09}
+  3%|███▎                                                                                                              | 73/2499 [09:05<4:14:10,  6.29s/it]  3%|███▍                                                                                                              | 74/2499 [09:11<4:13:54,  6.28s/it]                                                                                                                                                           {'loss': 0.6898, 'grad_norm': 0.131875678896904, 'learning_rate': 0.00019968401082705276, 'ppl': 1.9933, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4524.14, 'total_tokens': 3405160, 'epoch': 0.09}
+  3%|███▍                                                                                                              | 74/2499 [09:11<4:13:54,  6.28s/it]  3%|███▍                                                                                                              | 75/2499 [09:17<4:14:00,  6.29s/it]                                                                                                                                                           {'loss': 0.6869, 'grad_norm': 0.1403125524520874, 'learning_rate': 0.0001996739053092126, 'ppl': 1.9875, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4577.87, 'total_tokens': 3433985, 'epoch': 0.09}
+  3%|███▍                                                                                                              | 75/2499 [09:17<4:14:00,  6.29s/it]  3%|███▍                                                                                                              | 76/2499 [09:23<4:14:26,  6.30s/it]                                                                                                                                                           {'loss': 0.6758, 'grad_norm': 0.137966588139534, 'learning_rate': 0.00019966364099836681, 'ppl': 1.9656, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4609.11, 'total_tokens': 3463148, 'epoch': 0.09}
+  3%|███▍                                                                                                              | 76/2499 [09:23<4:14:26,  6.30s/it]  3%|███▌                                                                                                              | 77/2499 [09:30<4:14:24,  6.30s/it]                                                                                                                                                           {'loss': 0.6669, 'grad_norm': 0.13154162466526031, 'learning_rate': 0.00019965321791086768, 'ppl': 1.9482, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4764.08, 'total_tokens': 3493170, 'epoch': 0.09}
+  3%|███▌                                                                                                              | 77/2499 [09:30<4:14:24,  6.30s/it]  3%|███▌                                                                                                              | 78/2499 [09:36<4:13:47,  6.29s/it]                                                                                                                                                           {'loss': 0.6681, 'grad_norm': 0.1396287977695465, 'learning_rate': 0.00019964263606332051, 'ppl': 1.9505, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4335.49, 'total_tokens': 3520301, 'epoch': 0.09}
+  3%|███▌                                                                                                              | 78/2499 [09:36<4:13:47,  6.29s/it]  3%|███▌                                                                                                              | 79/2499 [09:42<4:13:27,  6.28s/it]                                                                                                                                                           {'loss': 0.6799, 'grad_norm': 0.1356486976146698, 'learning_rate': 0.00019963189547258356, 'ppl': 1.9737, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4451.59, 'total_tokens': 3548202, 'epoch': 0.09}
+  3%|███▌                                                                                                              | 79/2499 [09:42<4:13:27,  6.28s/it]  3%|███▋                                                                                                              | 80/2499 [09:49<4:13:19,  6.28s/it]                                                                                                                                                           {'loss': 0.6697, 'grad_norm': 0.14252781867980957, 'learning_rate': 0.0001996209961557679, 'ppl': 1.9537, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4500.44, 'total_tokens': 3576462, 'epoch': 0.1}
+  3%|███▋                                                                                                              | 80/2499 [09:49<4:13:19,  6.28s/it]  3%|███▋                                                                                                              | 81/2499 [09:55<4:13:09,  6.28s/it]                                                                                                                                                           {'loss': 0.7155, 'grad_norm': 0.14615966379642487, 'learning_rate': 0.00019960993813023745, 'ppl': 2.0452, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4544.14, 'total_tokens': 3604983, 'epoch': 0.1}
+  3%|███▋                                                                                                              | 81/2499 [09:55<4:13:09,  6.28s/it]  3%|███▋                                                                                                              | 82/2499 [10:01<4:13:23,  6.29s/it]                                                                                                                                                           {'loss': 0.6172, 'grad_norm': 0.13786305487155914, 'learning_rate': 0.0001995987214136091, 'ppl': 1.8537, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4671.06, 'total_tokens': 3634442, 'epoch': 0.1}
+  3%|███▋                                                                                                              | 82/2499 [10:01<4:13:23,  6.29s/it]  3%|███▊                                                                                                              | 83/2499 [10:07<4:13:31,  6.30s/it]                                                                                                                                                           {'loss': 0.6399, 'grad_norm': 0.14883151650428772, 'learning_rate': 0.00019958734602375247, 'ppl': 1.8963, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4347.61, 'total_tokens': 3661862, 'epoch': 0.1}
+  3%|███▊                                                                                                              | 83/2499 [10:07<4:13:31,  6.30s/it]  3%|███▊                                                                                                              | 84/2499 [10:14<4:13:26,  6.30s/it]                                                                                                                                                           {'loss': 0.6619, 'grad_norm': 0.1344694346189499, 'learning_rate': 0.00019957581197878996, 'ppl': 1.9385, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4744.18, 'total_tokens': 3691718, 'epoch': 0.1}
+  3%|███▊                                                                                                              | 84/2499 [10:14<4:13:26,  6.30s/it]  3%|███▉                                                                                                              | 85/2499 [10:20<4:13:08,  6.29s/it]                                                                                                                                                           {'loss': 0.7284, 'grad_norm': 0.12591156363487244, 'learning_rate': 0.00019956411929709678, 'ppl': 2.0718, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4638.44, 'total_tokens': 3720842, 'epoch': 0.1}
+  3%|███▉                                                                                                              | 85/2499 [10:20<4:13:08,  6.29s/it]  3%|███▉                                                                                                              | 86/2499 [10:26<4:12:45,  6.29s/it]                                                                                                                                                           {'loss': 0.6502, 'grad_norm': 0.1308436542749405, 'learning_rate': 0.00019955226799730081, 'ppl': 1.9159, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4529.63, 'total_tokens': 3749228, 'epoch': 0.1}
+  3%|███▉                                                                                                              | 86/2499 [10:26<4:12:45,  6.29s/it]  3%|███▉                                                                                                              | 87/2499 [10:33<4:12:38,  6.28s/it]                                                                                                                                                           {'loss': 0.6609, 'grad_norm': 0.13323400914669037, 'learning_rate': 0.00019954025809828266, 'ppl': 1.9365, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4566.95, 'total_tokens': 3777912, 'epoch': 0.1}
+  3%|███▉                                                                                                              | 87/2499 [10:33<4:12:38,  6.28s/it]  4%|████                                                                                                              | 88/2499 [10:39<4:12:35,  6.29s/it]                                                                                                                                                           {'loss': 0.6446, 'grad_norm': 0.16586028039455414, 'learning_rate': 0.00019952808961917558, 'ppl': 1.9052, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4553.1, 'total_tokens': 3806539, 'epoch': 0.11}
+  4%|████                                                                                                              | 88/2499 [10:39<4:12:35,  6.29s/it]  4%|████                                                                                                              | 89/2499 [10:45<4:12:53,  6.30s/it]                                                                                                                                                           {'loss': 0.6663, 'grad_norm': 0.14273381233215332, 'learning_rate': 0.0001995157625793655, 'ppl': 1.947, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4543.06, 'total_tokens': 3835239, 'epoch': 0.11}
+  4%|████                                                                                                              | 89/2499 [10:45<4:12:53,  6.30s/it]  4%|████                                                                                                              | 90/2499 [10:51<4:12:49,  6.30s/it]                                                                                                                                                           {'loss': 0.6725, 'grad_norm': 0.15345992147922516, 'learning_rate': 0.00019950327699849098, 'ppl': 1.9591, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4596.19, 'total_tokens': 3864175, 'epoch': 0.11}
+  4%|████                                                                                                              | 90/2499 [10:52<4:12:49,  6.30s/it]  4%|████▏                                                                                                             | 91/2499 [10:58<4:12:38,  6.29s/it]                                                                                                                                                           {'loss': 0.7163, 'grad_norm': 0.16092751920223236, 'learning_rate': 0.00019949063289644302, 'ppl': 2.0468, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4597.41, 'total_tokens': 3893079, 'epoch': 0.11}
+  4%|████▏                                                                                                             | 91/2499 [10:58<4:12:38,  6.29s/it]  4%|████▏                                                                                                             | 92/2499 [11:04<4:12:21,  6.29s/it]                                                                                                                                                           {'loss': 0.6764, 'grad_norm': 0.13062061369419098, 'learning_rate': 0.00019947783029336533, 'ppl': 1.9668, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4599.4, 'total_tokens': 3921954, 'epoch': 0.11}
+  4%|████▏                                                                                                             | 92/2499 [11:04<4:12:21,  6.29s/it]  4%|████▏                                                                                                             | 93/2499 [11:10<4:12:13,  6.29s/it]                                                                                                                                                           {'loss': 0.6585, 'grad_norm': 0.14627501368522644, 'learning_rate': 0.00019946486920965404, 'ppl': 1.9319, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4612.44, 'total_tokens': 3950949, 'epoch': 0.11}
+  4%|████▏                                                                                                             | 93/2499 [11:10<4:12:13,  6.29s/it]  4%|████▎                                                                                                             | 94/2499 [11:17<4:12:09,  6.29s/it]                                                                                                                                                           {'loss': 0.6825, 'grad_norm': 0.14802932739257812, 'learning_rate': 0.00019945174966595777, 'ppl': 1.9788, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4644.39, 'total_tokens': 3980160, 'epoch': 0.11}
+  4%|████▎                                                                                                             | 94/2499 [11:17<4:12:09,  6.29s/it]  4%|████▎                                                                                                             | 95/2499 [11:23<4:11:43,  6.28s/it]                                                                                                                                                           {'loss': 0.6535, 'grad_norm': 0.151302307844162, 'learning_rate': 0.0001994384716831776, 'ppl': 1.9223, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4503.85, 'total_tokens': 4008363, 'epoch': 0.11}
+  4%|████▎                                                                                                             | 95/2499 [11:23<4:11:43,  6.28s/it]  4%|████▍                                                                                                             | 96/2499 [11:29<4:12:15,  6.30s/it]                                                                                                                                                           {'loss': 0.6404, 'grad_norm': 0.15178830921649933, 'learning_rate': 0.000199425035282467, 'ppl': 1.8972, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4664.08, 'total_tokens': 4037899, 'epoch': 0.12}
+  4%|████▍                                                                                                             | 96/2499 [11:29<4:12:15,  6.30s/it]  4%|████▍                                                                                                             | 97/2499 [11:36<4:12:23,  6.30s/it]                                                                                                                                                           {'loss': 0.7097, 'grad_norm': 0.1457069218158722, 'learning_rate': 0.0001994114404852319, 'ppl': 2.0334, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4667.4, 'total_tokens': 4067373, 'epoch': 0.12}
+  4%|████▍                                                                                                             | 97/2499 [11:36<4:12:23,  6.30s/it]  4%|████▍                                                                                                             | 98/2499 [11:42<4:11:56,  6.30s/it]                                                                                                                                                           {'loss': 0.6538, 'grad_norm': 0.13825637102127075, 'learning_rate': 0.00019939768731313046, 'ppl': 1.9228, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4609.94, 'total_tokens': 4096295, 'epoch': 0.12}
+  4%|████▍                                                                                                             | 98/2499 [11:42<4:11:56,  6.30s/it]  4%|████▌                                                                                                             | 99/2499 [11:48<4:11:49,  6.30s/it]                                                                                                                                                           {'loss': 0.6082, 'grad_norm': 0.14136871695518494, 'learning_rate': 0.00019938377578807318, 'ppl': 1.8371, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4684.31, 'total_tokens': 4125771, 'epoch': 0.12}
+  4%|████▌                                                                                                             | 99/2499 [11:48<4:11:49,  6.30s/it]  4%|████▌                                                                                                            | 100/2499 [11:54<4:11:44,  6.30s/it]                                                                                                                                                           {'loss': 0.6605, 'grad_norm': 0.1564965695142746, 'learning_rate': 0.0001993697059322229, 'ppl': 1.9358, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4555.83, 'total_tokens': 4154447, 'epoch': 0.12}
+  4%|████▌                                                                                                            | 100/2499 [11:54<4:11:44,  6.30s/it][2025-12-28 11:17:31,070] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:17:32,807] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8711647987365723
+[2025-12-28 11:17:33,641] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8331155776977539
+[2025-12-28 11:17:34,487] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8463048934936523
+[2025-12-28 11:17:35,331] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8430600166320801
+[2025-12-28 11:17:35,331] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+  0%|                                                                                                                               | 0/90 [00:00<?, ?it/s][A
+  2%|██▋                                                                                                                    | 2/90 [00:00<00:36,  2.42it/s][A
+  3%|███▉                                                                                                                   | 3/90 [00:01<00:55,  1.57it/s][A
+  4%|█████▎                                                                                                                 | 4/90 [00:02<01:00,  1.42it/s][A
+  6%|██████▌                                                                                                                | 5/90 [00:03<01:06,  1.28it/s][A
+  7%|███████▉                                                                                                               | 6/90 [00:04<01:06,  1.26it/s][A
+  8%|█████████▎                                                                                                             | 7/90 [00:05<01:10,  1.17it/s][A
+  9%|██████████▌                                                                                                            | 8/90 [00:06<01:09,  1.19it/s][A
+ 10%|███████████▉                                                                                                           | 9/90 [00:07<01:10,  1.15it/s][A
+ 11%|█████████████                                                                                                         | 10/90 [00:07<01:08,  1.18it/s][A
+ 12%|██████████████▍                                                                                                       | 11/90 [00:08<01:09,  1.14it/s][A
+ 13%|███████████████▋                                                                                                      | 12/90 [00:09<01:06,  1.17it/s][A
+ 14%|█��███████████████                                                                                                     | 13/90 [00:10<01:07,  1.13it/s][A
+ 16%|██████████████████▎                                                                                                   | 14/90 [00:11<01:05,  1.16it/s][A
+ 17%|███████████████████▋                                                                                                  | 15/90 [00:12<01:05,  1.14it/s][A
+ 18%|████████████████████▉                                                                                                 | 16/90 [00:13<01:03,  1.16it/s][A
+ 19%|██████████████████████▎                                                                                               | 17/90 [00:14<01:04,  1.13it/s][A
+ 20%|███████████████████████▌                                                                                              | 18/90 [00:14<01:01,  1.17it/s][A
+ 21%|████████████████████████▉                                                                                             | 19/90 [00:15<01:01,  1.15it/s][A
+ 22%|██████████████████████████▏                                                                                           | 20/90 [00:16<00:59,  1.17it/s][A
+ 23%|███████████████████████████▌                                                                                          | 21/90 [00:17<01:00,  1.14it/s][A
+ 24%|████████████████████████████▊                                                                                         | 22/90 [00:18<00:58,  1.17it/s][A
+ 26%|██████████████████████████████▏                                                                                       | 23/90 [00:19<00:58,  1.15it/s][A
+ 27%|███████████████████████████████▍                                                                                      | 24/90 [00:20<00:56,  1.17it/s][A
+ 28%|████████████████████████████████▊                                                                                     | 25/90 [00:20<00:56,  1.15it/s][A
+ 29%|██████████████████████████████████                                                                                    | 26/90 [00:21<00:54,  1.17it/s][A
+ 30%|███████████████████████████████████▍                                                                                  | 27/90 [00:22<00:54,  1.15it/s][A
+ 31%|████████████████████████████████████▋                                                                                 | 28/90 [00:23<00:52,  1.17it/s][A
+ 32%|██████████████████████████████████████                                                                                | 29/90 [00:24<00:53,  1.15it/s][A
+ 33%|███████████████████████████████████████▎                                                                              | 30/90 [00:25<00:51,  1.17it/s][A
+ 34%|████████████████████████████████████████▋                                                                             | 31/90 [00:26<00:51,  1.15it/s][A
+ 36%|█████████████████████████████████████████▉                                                                            | 32/90 [00:26<00:49,  1.17it/s][A
+ 37%|███████████████████████████████████████████▎                                                                          | 33/90 [00:27<00:49,  1.15it/s][A
+ 38%|████████████████████████████████████████████▌                                                                         | 34/90 [00:28<00:47,  1.17it/s][A
+ 39%|█████████████████████████████████████████████▉                                                                        | 35/90 [00:29<00:47,  1.15it/s][A
+ 40%|███████████████████████████████████████████████▏                                                                      | 36/90 [00:30<00:45,  1.17it/s][A
+ 41%|████████████████████████████████████████████████▌                                                                     | 37/90 [00:31<00:45,  1.15it/s][A
+ 42%|█████████████████████████████████████████████████▊                                                                    | 38/90 [00:32<00:44,  1.17it/s][A
+ 43%|███████████████████████████████████████████████████▏                                                                  | 39/90 [00:32<00:44,  1.15it/s][A
+ 44%|████████████████████████████████████████████████████▍                                                                 | 40/90 [00:33<00:42,  1.17it/s][A
+ 46%|█████████████████████████████████████████████████████▊                                                                | 41/90 [00:34<00:42,  1.15it/s][A
+ 47%|███████████████████████████████████████████████████████                                                               | 42/90 [00:35<00:40,  1.17it/s][A
+ 48%|████████████████████████████████████████████████████████▍                                                             | 43/90 [00:36<00:40,  1.15it/s][A
+ 49%|█████████████████████████████████████████████████████████▋                                                            | 44/90 [00:37<00:39,  1.17it/s][A
+ 50%|███████████████████████████████████████████████████████████                                                           | 45/90 [00:38<00:38,  1.15it/s][A
+ 51%|████████████████████████████████████████████████████████████▎                                                         | 46/90 [00:38<00:37,  1.17it/s][A
+ 52%|█████████████████████████████████████████████████████████████▌                                                        | 47/90 [00:39<00:37,  1.16it/s][A
+ 53%|██████████████████████████████████████████████████████████████▉                                                       | 48/90 [00:40<00:35,  1.17it/s][A
+ 54%|████████████████████████████████████████████████████████████████▏                                                     | 49/90 [00:41<00:35,  1.15it/s][A
+ 56%|█████████████████████████████████████████████████████████████████▌                                                    | 50/90 [00:42<00:34,  1.18it/s][A
+ 57%|██████████████████████████████████████████████████████████████████▊                                                   | 51/90 [00:43<00:33,  1.16it/s][A
+ 58%|████████████████████████████████████████████████████████████████████▏                                                 | 52/90 [00:44<00:32,  1.17it/s][A
+ 59%|█████████████████████████████████████████████████████████████████████▍                                                | 53/90 [00:44<00:32,  1.15it/s][A
+ 60%|█████████████████████���████████████████████████████████████████████████▊                                               | 54/90 [00:45<00:30,  1.17it/s][A
+ 61%|████████████████████████████████████████████████████████████████████████                                              | 55/90 [00:46<00:30,  1.15it/s][A
+ 62%|█████████████████████████████████████████████████████████████████████████▍                                            | 56/90 [00:47<00:28,  1.17it/s][A
+ 63%|██████████████████████████████████████████████████████████████████████████▋                                           | 57/90 [00:48<00:28,  1.14it/s][A
+ 64%|████████████████████████████████████████████████████████████████████████████                                          | 58/90 [00:49<00:27,  1.17it/s][A
+ 66%|█████████████████████████████████████████████████████████████████████████████▎                                        | 59/90 [00:50<00:27,  1.13it/s][A
+ 67%|██████████████████████████████████████████████████████████████████████████████▋                                       | 60/90 [00:51<00:25,  1.17it/s][A
+ 68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 61/90 [00:51<00:25,  1.13it/s][A
+ 69%|█████████████████████████████████████████████████████████████████████████████████▎                                    | 62/90 [00:52<00:24,  1.16it/s][A
+ 70%|██████████████████████████████████████████████████████████████████████████████████▌                                   | 63/90 [00:53<00:23,  1.13it/s][A
+ 71%|███████████████████████████████████████████████████████████████████████████████████▉                                  | 64/90 [00:54<00:22,  1.16it/s][A
+ 72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 65/90 [00:55<00:22,  1.13it/s][A
+ 73%|██████████████████████████████████████████████████████████████████████████████████████▌                               | 66/90 [00:56<00:20,  1.16it/s][A
+ 74%|███████████████████████████████████████████████████████████████████████████████████████▊                              | 67/90 [00:57<00:20,  1.13it/s][A
+ 76%|█████████████████████████████████████████████████████████████████████████████████████████▏                            | 68/90 [00:57<00:18,  1.16it/s][A
+ 77%|██████████████████████████████████████████████████████████████████████████████████████████▍                           | 69/90 [00:58<00:18,  1.14it/s][A
+ 78%|█████���█████████████████████████████████████████████████████████████████████████████████████▊                          | 70/90 [00:59<00:17,  1.16it/s][A
+ 79%|█████████████████████████████████████████████████████████████████████████████████████████████                         | 71/90 [01:00<00:16,  1.15it/s][A
+ 80%|██████████████████████████████████████████████████████████████████████████████████████████████▍                       | 72/90 [01:01<00:15,  1.17it/s][A
+ 81%|███████████████████████████████████████████████████████████████████████████████████████████████▋                      | 73/90 [01:03<00:19,  1.14s/it][A
+ 82%|█████████████████████████████████████████████████████████████████████████████████████████████████                     | 74/90 [01:04<00:16,  1.03s/it][A
+ 83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 75/90 [01:04<00:14,  1.01it/s][A
+ 84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 76/90 [01:05<00:13,  1.07it/s][A
+ 86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 77/90 [01:06<00:12,  1.08it/s][A
+ 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 78/90 [01:07<00:10,  1.12it/s][A
+ 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 79/90 [01:08<00:09,  1.12it/s][A
+ 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 80/90 [01:09<00:08,  1.15it/s][A
+ 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 81/90 [01:10<00:07,  1.14it/s][A
+ 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 82/90 [01:10<00:06,  1.16it/s][A
+ 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 83/90 [01:11<00:06,  1.14it/s][A
+ 93%|█████████████████████���████████████████████████████████████████████████████████████████████████████████████████▏       | 84/90 [01:12<00:05,  1.16it/s][A
+ 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 85/90 [01:13<00:04,  1.15it/s][A
+ 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 86/90 [01:14<00:03,  1.17it/s][A
+ 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 87/90 [01:15<00:02,  1.15it/s][A
+ 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 88/90 [01:16<00:01,  1.17it/s][A
+ 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00,  1.16it/s][A
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00,  1.14it/s][A                                                                                                                                                           
+                                                                                                                                                           [A{'eval_loss': 0.6468729376792908, 'eval_runtime': 79.9715, 'eval_samples_per_second': 9.128, 'eval_steps_per_second': 2.288, 'eval_ppl': 1.9096, 'memory/max_active (GiB)': 12.83, 'memory/max_allocated (GiB)': 6.85, 'memory/device_reserved (GiB)': 20.07, 'epoch': 0.12}
+  4%|████▌                                                                                                            | 100/2499 [13:19<4:11:44,  6.30s/it]
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:18<00:00,  1.14it/s][A
+                                                                                                                                                           [A  4%|████▌                                                                                                           | 101/2499 [13:25<21:01:55, 31.57s/it]                                                                                                                                                           {'loss': 0.6184, 'grad_norm': 0.17828112840652466, 'learning_rate': 0.00019935547776799467, 'ppl': 1.856, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4614.55, 'total_tokens': 5453331, 'epoch': 0.12}
+  4%|████▌                                                                                                           | 101/2499 [13:25<21:01:55, 31.57s/it]  4%|████▌                                                                                                           | 102/2499 [13:31<15:58:35, 23.99s/it]                                                                                                                                                           {'loss': 0.6822, 'grad_norm': 0.2011706829071045, 'learning_rate': 0.00019934109131805575, 'ppl': 1.9782, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4637.52, 'total_tokens': 5482579, 'epoch': 0.12}
+  4%|████▌                                                                                                           | 102/2499 [13:31<15:58:35, 23.99s/it]  4%|████▌                                                                                                           | 103/2499 [13:38<12:26:29, 18.69s/it]                                                                                                                                                           {'loss': 0.5284, 'grad_norm': 0.13656415045261383, 'learning_rate': 0.00019932654660532548, 'ppl': 1.6962, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4598.33, 'total_tokens': 5511638, 'epoch': 0.12}
+  4%|████▌                                                                                                           | 103/2499 [13:38<12:26:29, 18.69s/it]  4%|████▋                                                                                                            | 104/2499 [13:44<9:57:26, 14.97s/it]                                                                                                                                                           {'loss': 0.6585, 'grad_norm': 0.15870781242847443, 'learning_rate': 0.0001993118436529755, 'ppl': 1.9319, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4352.34, 'total_tokens': 5538918, 'epoch': 0.12}
+  4%|████▋                                                                                                            | 104/2499 [13:44<9:57:26, 14.97s/it]  4%|████▋                                                                                                            | 105/2499 [13:50<8:13:25, 12.37s/it]                                                                                                                                                           {'loss': 0.6339, 'grad_norm': 0.14072741568088531, 'learning_rate': 0.00019929698248442938, 'ppl': 1.8849, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4732.02, 'total_tokens': 5568710, 'epoch': 0.13}
+  4%|████▋                                                                                                            | 105/2499 [13:50<8:13:25, 12.37s/it]  4%|████▊                                                                                                            | 106/2499 [13:56<7:00:22, 10.54s/it]                                                                                                                                                           {'loss': 0.6381, 'grad_norm': 0.14659491181373596, 'learning_rate': 0.00019928196312336285, 'ppl': 1.8929, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4575.0, 'total_tokens': 5597423, 'epoch': 0.13}
+  4%|████▊                                                                                                            | 106/2499 [13:56<7:00:22, 10.54s/it]  4%|████▊                                                                                                            | 107/2499 [14:03<6:09:10,  9.26s/it]                                                                                                                                                           {'loss': 0.6897, 'grad_norm': 0.1409890204668045, 'learning_rate': 0.00019926678559370364, 'ppl': 1.9931, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4498.1, 'total_tokens': 5625629, 'epoch': 0.13}
+  4%|████▊                                                                                                            | 107/2499 [14:03<6:09:10,  9.26s/it]  4%|████▉                                                                                                            | 108/2499 [14:09<5:33:17,  8.36s/it]                                                                                                                                                           {'loss': 0.5941, 'grad_norm': 0.1351788341999054, 'learning_rate': 0.00019925144991963145, 'ppl': 1.8114, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4593.3, 'total_tokens': 5654426, 'epoch': 0.13}
+  4%|████▉                                                                                                            | 108/2499 [14:09<5:33:17,  8.36s/it]  4%|████▉                                                                                                            | 109/2499 [14:15<5:08:46,  7.75s/it]                                                                                                                                                           {'loss': 0.6293, 'grad_norm': 0.1541460007429123, 'learning_rate': 0.00019923595612557793, 'ppl': 1.8763, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4634.55, 'total_tokens': 5683721, 'epoch': 0.13}
+  4%|████▉                                                                                                            | 109/2499 [14:15<5:08:46,  7.75s/it]  4%|████▉                                                                                                            | 110/2499 [14:22<4:51:31,  7.32s/it]                                                                                                                                                           {'loss': 0.6673, 'grad_norm': 0.17826059460639954, 'learning_rate': 0.0001992203042362266, 'ppl': 1.949, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4305.69, 'total_tokens': 5710908, 'epoch': 0.13}
+  4%|████▉                                                                                                            | 110/2499 [14:22<4:51:31,  7.32s/it]  4%|█████                                                                                                            | 111/2499 [14:28<4:39:24,  7.02s/it]                                                                                                                                                           {'loss': 0.7005, 'grad_norm': 0.14798669517040253, 'learning_rate': 0.00019920449427651292, 'ppl': 2.0148, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4490.72, 'total_tokens': 5739262, 'epoch': 0.13}
+  4%|█████                                                                                                            | 111/2499 [14:28<4:39:24,  7.02s/it]  4%|█████                                                                                                            | 112/2499 [14:34<4:30:29,  6.80s/it]                                                                                                                                                           {'loss': 0.701, 'grad_norm': 0.14876116812229156, 'learning_rate': 0.00019918852627162412, 'ppl': 2.0158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4544.41, 'total_tokens': 5767800, 'epoch': 0.13}
+  4%|█████                                                                                                            | 112/2499 [14:34<4:30:29,  6.80s/it]  5%|█████                                                                                                            | 113/2499 [14:41<4:24:02,  6.64s/it]                                                                                                                                                           {'loss': 0.6515, 'grad_norm': 0.14015726745128632, 'learning_rate': 0.00019917240024699924, 'ppl': 1.9184, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4582.81, 'total_tokens': 5796516, 'epoch': 0.14}
+  5%|█████                                                                                                            | 113/2499 [14:41<4:24:02,  6.64s/it]  5%|█████▏                                                                                                           | 114/2499 [14:47<4:19:38,  6.53s/it]                                                                                                                                                           {'loss': 0.6357, 'grad_norm': 0.14569461345672607, 'learning_rate': 0.00019915611622832905, 'ppl': 1.8883, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4596.84, 'total_tokens': 5825374, 'epoch': 0.14}
+  5%|█████▏                                                                                                           | 114/2499 [14:47<4:19:38,  6.53s/it]  5%|█████▏                                                                                                           | 115/2499 [14:53<4:16:34,  6.46s/it]                                                                                                                                                           {'loss': 0.6666, 'grad_norm': 0.1522768884897232, 'learning_rate': 0.00019913967424155598, 'ppl': 1.9476, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4636.01, 'total_tokens': 5854490, 'epoch': 0.14}
+  5%|█████▏                                                                                                           | 115/2499 [14:53<4:16:34,  6.46s/it]  5%|█████▏                                                                                                           | 116/2499 [14:59<4:14:25,  6.41s/it]                                                                                                                                                           {'loss': 0.6558, 'grad_norm': 0.15072417259216309, 'learning_rate': 0.00019912307431287427, 'ppl': 1.9267, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4478.85, 'total_tokens': 5882638, 'epoch': 0.14}
+  5%|█████▏                                                                                                           | 116/2499 [14:59<4:14:25,  6.41s/it]  5%|█████▎                                                                                                           | 117/2499 [15:06<4:13:09,  6.38s/it]                                                                                                                                                           {'loss': 0.6541, 'grad_norm': 0.140936478972435, 'learning_rate': 0.0001991063164687296, 'ppl': 1.9234, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4528.6, 'total_tokens': 5911187, 'epoch': 0.14}
+  5%|█████▎                                                                                                           | 117/2499 [15:06<4:13:09,  6.38s/it]  5%|█████▎                                                                                                           | 118/2499 [15:12<4:12:05,  6.35s/it]                                                                                                                                                           {'loss': 0.6191, 'grad_norm': 0.14590787887573242, 'learning_rate': 0.00019908940073581937, 'ppl': 1.8573, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4668.98, 'total_tokens': 5940567, 'epoch': 0.14}
+  5%|█████▎                                                                                                           | 118/2499 [15:12<4:12:05,  6.35s/it]  5%|█████▍                                                                                                           | 119/2499 [15:18<4:11:21,  6.34s/it]                                                                                                                                                           {'loss': 0.6365, 'grad_norm': 0.13646982610225677, 'learning_rate': 0.0001990723271410924, 'ppl': 1.8899, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4828.06, 'total_tokens': 5970969, 'epoch': 0.14}
+  5%|█████▍                                                                                                           | 119/2499 [15:18<4:11:21,  6.34s/it]  5%|█████▍                                                                                                           | 120/2499 [15:25<4:10:27,  6.32s/it]                                                                                                                                                           {'loss': 0.5822, 'grad_norm': 0.1353752613067627, 'learning_rate': 0.00019905509571174914, 'ppl': 1.79, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4639.33, 'total_tokens': 6000051, 'epoch': 0.14}
+  5%|█████▍                                                                                                           | 120/2499 [15:25<4:10:27,  6.32s/it]  5%|█████▍                                                                                                           | 121/2499 [15:31<4:09:58,  6.31s/it]                                                                                                                                                           {'loss': 0.6289, 'grad_norm': 0.17556677758693695, 'learning_rate': 0.00019903770647524137, 'ppl': 1.8755, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4626.75, 'total_tokens': 6029115, 'epoch': 0.15}
+  5%|█████▍                                                                                                           | 121/2499 [15:31<4:09:58,  6.31s/it]  5%|██��██▌                                                                                                           | 122/2499 [15:37<4:09:14,  6.29s/it]                                                                                                                                                           {'loss': 0.6331, 'grad_norm': 0.1434057652950287, 'learning_rate': 0.0001990201594592723, 'ppl': 1.8834, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4451.39, 'total_tokens': 6056947, 'epoch': 0.15}
+  5%|█████▌                                                                                                           | 122/2499 [15:37<4:09:14,  6.29s/it]  5%|█████▌                                                                                                           | 123/2499 [15:43<4:09:19,  6.30s/it]                                                                                                                                                           {'loss': 0.6185, 'grad_norm': 0.14586731791496277, 'learning_rate': 0.00019900245469179655, 'ppl': 1.8561, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4505.08, 'total_tokens': 6085351, 'epoch': 0.15}
+  5%|█████▌                                                                                                           | 123/2499 [15:43<4:09:19,  6.30s/it]  5%|█████▌                                                                                                           | 124/2499 [15:50<4:09:00,  6.29s/it]                                                                                                                                                           {'loss': 0.6336, 'grad_norm': 0.15855848789215088, 'learning_rate': 0.00019898459220102002, 'ppl': 1.8844, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4453.23, 'total_tokens': 6113294, 'epoch': 0.15}
+  5%|█████▌                                                                                                           | 124/2499 [15:50<4:09:00,  6.29s/it]  5%|█████▋                                                                                                           | 125/2499 [15:56<4:08:35,  6.28s/it]                                                                                                                                                           {'loss': 0.6083, 'grad_norm': 0.14481675624847412, 'learning_rate': 0.0001989665720153999, 'ppl': 1.8373, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4599.51, 'total_tokens': 6142097, 'epoch': 0.15}
+  5%|█████▋                                                                                                           | 125/2499 [15:56<4:08:35,  6.28s/it]  5%|█████▋                                                                                                           | 126/2499 [16:02<4:08:12,  6.28s/it]                                                                                                                                                           {'loss': 0.6727, 'grad_norm': 0.167931467294693, 'learning_rate': 0.0001989483941636446, 'ppl': 1.9595, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4386.44, 'total_tokens': 6169540, 'epoch': 0.15}
+  5%|█████▋                                                                                                           | 126/2499 [16:02<4:08:12,  6.28s/it]  5%|█████▋                                                                                                           | 127/2499 [16:08<4:08:05,  6.28s/it]                                                                                                                                                           {'loss': 0.601, 'grad_norm': 0.155978262424469, 'learning_rate': 0.00019893005867471374, 'ppl': 1.8239, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4730.58, 'total_tokens': 6199215, 'epoch': 0.15}
+  5%|█████▋                                                                                                           | 127/2499 [16:08<4:08:05,  6.28s/it]  5%|█████▊                                                                                                           | 128/2499 [16:15<4:07:48,  6.27s/it]                                                                                                                                                           {'loss': 0.6443, 'grad_norm': 0.1500401645898819, 'learning_rate': 0.00019891156557781797, 'ppl': 1.9047, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4510.45, 'total_tokens': 6227443, 'epoch': 0.15}
+  5%|█████▊                                                                                                           | 128/2499 [16:15<4:07:48,  6.27s/it]  5%|█████▊                                                                                                           | 129/2499 [16:21<4:07:33,  6.27s/it]                                                                                                                                                           {'loss': 0.6555, 'grad_norm': 0.15343204140663147, 'learning_rate': 0.0001988929149024192, 'ppl': 1.9261, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4447.09, 'total_tokens': 6255262, 'epoch': 0.15}
+  5%|█████▊                                                                                                           | 129/2499 [16:21<4:07:33,  6.27s/it]  5%|█████▉                                                                                                           | 130/2499 [16:27<4:07:53,  6.28s/it]                                                                                                                                                           {'loss': 0.6536, 'grad_norm': 0.18412944674491882, 'learning_rate': 0.00019887410667823022, 'ppl': 1.9224, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4639.29, 'total_tokens': 6284499, 'epoch': 0.16}
+  5%|█████▉                                                                                                           | 130/2499 [16:27<4:07:53,  6.28s/it]  5%|█████▉                                                                                                           | 131/2499 [16:34<4:07:47,  6.28s/it]                                                                                                                                                           {'loss': 0.6874, 'grad_norm': 6.614463806152344, 'learning_rate': 0.00019885514093521495, 'ppl': 1.9885, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4495.37, 'total_tokens': 6312705, 'epoch': 0.16}
+  5%|█████▉                                                                                                           | 131/2499 [16:34<4:07:47,  6.28s/it]  5%|█████▉                                                                                                           | 132/2499 [16:40<4:07:36,  6.28s/it]                                                                                                                                                           {'loss': 0.6402, 'grad_norm': 0.1778506189584732, 'learning_rate': 0.0001988360177035881, 'ppl': 1.8969, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4504.67, 'total_tokens': 6340952, 'epoch': 0.16}
+  5%|█████▉                                                                                                           | 132/2499 [16:40<4:07:36,  6.28s/it]  5%|██████                                                                                                           | 133/2499 [16:46<4:07:13,  6.27s/it]                                                                                                                                                           {'loss': 0.5849, 'grad_norm': 0.15809500217437744, 'learning_rate': 0.00019881673701381547, 'ppl': 1.7948, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4387.74, 'total_tokens': 6368377, 'epoch': 0.16}
+  5%|██████                                                                                                           | 133/2499 [16:46<4:07:13,  6.27s/it]  5%|██████                                                                                                           | 134/2499 [16:52<4:07:04,  6.27s/it]                                                                                                                                                           {'loss': 0.5755, 'grad_norm': 0.16758741438388824, 'learning_rate': 0.00019879729889661353, 'ppl': 1.778, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4713.74, 'total_tokens': 6397901, 'epoch': 0.16}
+  5%|██████                                                                                                           | 134/2499 [16:52<4:07:04,  6.27s/it]  5%|██���███                                                                                                           | 135/2499 [16:59<4:06:57,  6.27s/it]                                                                                                                                                           {'loss': 0.6093, 'grad_norm': 0.17591319978237152, 'learning_rate': 0.00019877770338294973, 'ppl': 1.8391, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4636.12, 'total_tokens': 6426945, 'epoch': 0.16}
+  5%|██████                                                                                                           | 135/2499 [16:59<4:06:57,  6.27s/it]  5%|██████▏                                                                                                          | 136/2499 [17:05<4:06:54,  6.27s/it]                                                                                                                                                           {'loss': 0.6427, 'grad_norm': 0.18837158381938934, 'learning_rate': 0.0001987579505040421, 'ppl': 1.9016, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4433.54, 'total_tokens': 6454744, 'epoch': 0.16}
+  5%|██████▏                                                                                                          | 136/2499 [17:05<4:06:54,  6.27s/it]  5%|██████▏                                                                                                          | 137/2499 [17:11<4:07:08,  6.28s/it]                                                                                                                                                           {'loss': 0.6579, 'grad_norm': 0.1512988954782486, 'learning_rate': 0.00019873804029135942, 'ppl': 1.9307, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4501.54, 'total_tokens': 6483079, 'epoch': 0.16}
+  5%|██████▏                                                                                                          | 137/2499 [17:11<4:07:08,  6.28s/it]  6%|██████▏                                                                                                          | 138/2499 [17:17<4:06:52,  6.27s/it]                                                                                                                                                           {'loss': 0.6406, 'grad_norm': 0.1809886246919632, 'learning_rate': 0.00019871797277662125, 'ppl': 1.8976, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4511.87, 'total_tokens': 6511327, 'epoch': 0.17}
+  6%|██████▏                                                                                                          | 138/2499 [17:17<4:06:52,  6.27s/it]  6%|██████▎                                                                                                          | 139/2499 [17:24<4:07:01,  6.28s/it]                                                                                                                                                           {'loss': 0.6779, 'grad_norm': 0.1574440598487854, 'learning_rate': 0.00019869774799179755, 'ppl': 1.9697, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4721.62, 'total_tokens': 6541034, 'epoch': 0.17}
+  6%|██████▎                                                                                                          | 139/2499 [17:24<4:07:01,  6.28s/it]  6%|██████▎                                                                                                          | 140/2499 [17:30<4:06:55,  6.28s/it]                                                                                                                                                           {'loss': 0.6388, 'grad_norm': 0.16212943196296692, 'learning_rate': 0.00019867736596910902, 'ppl': 1.8942, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4727.92, 'total_tokens': 6570721, 'epoch': 0.17}
+  6%|██████▎                                                                                                          | 140/2499 [17:30<4:06:55,  6.28s/it]  6%|██████▍                                                                                                          | 141/2499 [17:36<4:06:44,  6.28s/it]                                                                                                                                                           {'loss': 0.6901, 'grad_norm': 0.16586321592330933, 'learning_rate': 0.00019865682674102676, 'ppl': 1.9939, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4663.45, 'total_tokens': 6599963, 'epoch': 0.17}
+  6%|██████▍                                                                                                          | 141/2499 [17:36<4:06:44,  6.28s/it]  6%|██████▍                                                                                                          | 142/2499 [17:43<4:06:23,  6.27s/it]                                                                                                                                                           {'loss': 0.6483, 'grad_norm': 0.1520916223526001, 'learning_rate': 0.00019863613034027224, 'ppl': 1.9123, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4568.36, 'total_tokens': 6628544, 'epoch': 0.17}
+  6%|██████▍                                                                                                          | 142/2499 [17:43<4:06:23,  6.27s/it]  6%|██████▍                                                                                                          | 143/2499 [17:49<4:07:00,  6.29s/it]                                                                                                                                                           {'loss': 0.6739, 'grad_norm': 0.17079249024391174, 'learning_rate': 0.00019861527679981752, 'ppl': 1.9619, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4642.42, 'total_tokens': 6657935, 'epoch': 0.17}
+  6%|██████▍                                                                                                          | 143/2499 [17:49<4:07:00,  6.29s/it]  6%|██████▌                                                                                                          | 144/2499 [17:55<4:07:16,  6.30s/it]                                                                                                                                                           {'loss': 0.6213, 'grad_norm': 0.14469042420387268, 'learning_rate': 0.00019859426615288488, 'ppl': 1.8613, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4454.93, 'total_tokens': 6686079, 'epoch': 0.17}
+  6%|██████▌                                                                                                          | 144/2499 [17:55<4:07:16,  6.30s/it]  6%|██████▌                                                                                                          | 145/2499 [18:02<4:06:50,  6.29s/it]                                                                                                                                                           {'loss': 0.6334, 'grad_norm': 0.15830209851264954, 'learning_rate': 0.00019857309843294684, 'ppl': 1.884, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4440.19, 'total_tokens': 6713910, 'epoch': 0.17}
+  6%|██████▌                                                                                                          | 145/2499 [18:02<4:06:50,  6.29s/it]  6%|██████▌                                                                                                          | 146/2499 [18:08<4:06:32,  6.29s/it]                                                                                                                                                           {'loss': 0.6419, 'grad_norm': 0.15467514097690582, 'learning_rate': 0.00019855177367372634, 'ppl': 1.9001, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4476.41, 'total_tokens': 6741989, 'epoch': 0.18}
+  6%|██████▌                                                                                                          | 146/2499 [18:08<4:06:32,  6.29s/it]  6%|██████▋                                                                                                          | 147/2499 [18:14<4:06:15,  6.28s/it]                                                                                                                                                           {'loss': 0.6124, 'grad_norm': 0.14238551259040833, 'learning_rate': 0.0001985302919091963, 'ppl': 1.8449, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4580.51, 'total_tokens': 6770703, 'epoch': 0.18}
+  6%|██████▋                                                                                                          | 147/2499 [18:14<4:06:15,  6.28s/it]  6%|██████▋                                                                                                          | 148/2499 [18:20<4:06:06,  6.28s/it]                                                                                                                                                           {'loss': 0.6293, 'grad_norm': 0.16102945804595947, 'learning_rate': 0.00019850865317357988, 'ppl': 1.8763, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4631.61, 'total_tokens': 6799769, 'epoch': 0.18}
+  6%|██████▋                                                                                                          | 148/2499 [18:20<4:06:06,  6.28s/it]  6%|██████▋                                                                                                          | 149/2499 [18:27<4:05:48,  6.28s/it]                                                                                                                                                           {'loss': 0.6808, 'grad_norm': 0.1688845455646515, 'learning_rate': 0.00019848685750135033, 'ppl': 1.9755, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4488.35, 'total_tokens': 6827878, 'epoch': 0.18}
+  6%|██████▋                                                                                                          | 149/2499 [18:27<4:05:48,  6.28s/it]  6%|██████▊                                                                                                          | 150/2499 [18:33<4:05:57,  6.28s/it]                                                                                                                                                           {'loss': 0.6459, 'grad_norm': 0.14278124272823334, 'learning_rate': 0.00019846490492723084, 'ppl': 1.9077, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4585.29, 'total_tokens': 6856742, 'epoch': 0.18}
+  6%|██████▊                                                                                                          | 150/2499 [18:33<4:05:57,  6.28s/it]  6%|██████▊                                                                                                          | 151/2499 [18:39<4:06:23,  6.30s/it]                                                                                                                                                           {'loss': 0.6847, 'grad_norm': 0.1538703888654709, 'learning_rate': 0.0001984427954861946, 'ppl': 1.9832, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4651.06, 'total_tokens': 6886155, 'epoch': 0.18}
+  6%|██████▊                                                                                                          | 151/2499 [18:39<4:06:23,  6.30s/it]  6%|██████▊                                                                                                          | 152/2499 [18:46<4:06:17,  6.30s/it]                                                                                                                                                           {'loss': 0.6242, 'grad_norm': 0.15251557528972626, 'learning_rate': 0.00019842052921346479, 'ppl': 1.8668, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4760.27, 'total_tokens': 6916113, 'epoch': 0.18}
+  6%|██████▊                                                                                                          | 152/2499 [18:46<4:06:17,  6.30s/it]  6%|██████▉                                                                                                          | 153/2499 [18:52<4:06:09,  6.30s/it]                                                                                                                                                           {'loss': 0.6634, 'grad_norm': 0.15581682324409485, 'learning_rate': 0.00019839810614451434, 'ppl': 1.9414, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4622.18, 'total_tokens': 6945193, 'epoch': 0.18}
+  6%|██████▉                                                                                                          | 153/2499 [18:52<4:06:09,  6.30s/it]  6%|██████▉                                                                                                          | 154/2499 [18:58<4:05:45,  6.29s/it]                                                                                                                                                           {'loss': 0.6208, 'grad_norm': 0.14313741028308868, 'learning_rate': 0.00019837552631506592, 'ppl': 1.8604, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4588.89, 'total_tokens': 6973954, 'epoch': 0.18}
+  6%|██████▉                                                                                                          | 154/2499 [18:58<4:05:45,  6.29s/it]  6%|███████                                                                                                          | 155/2499 [19:04<4:05:27,  6.28s/it]                                                                                                                                                           {'loss': 0.652, 'grad_norm': 0.14645761251449585, 'learning_rate': 0.00019835278976109214, 'ppl': 1.9194, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4630.67, 'total_tokens': 7002988, 'epoch': 0.19}
+  6%|███████                                                                                                          | 155/2499 [19:04<4:05:27,  6.28s/it]  6%|███████                                                                                                          | 156/2499 [19:11<4:05:11,  6.28s/it]                                                                                                                                                           {'loss': 0.6053, 'grad_norm': 0.1450553685426712, 'learning_rate': 0.0001983298965188151, 'ppl': 1.8318, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4535.2, 'total_tokens': 7031406, 'epoch': 0.19}
+  6%|███████                                                                                                          | 156/2499 [19:11<4:05:11,  6.28s/it]  6%|███████                                                                                                          | 157/2499 [19:17<4:05:20,  6.29s/it]                                                                                                                                                           {'loss': 0.6132, 'grad_norm': 0.14832331240177155, 'learning_rate': 0.00019830684662470663, 'ppl': 1.8463, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4542.32, 'total_tokens': 7060015, 'epoch': 0.19}
+  6%|███████                                                                                                          | 157/2499 [19:17<4:05:20,  6.29s/it]  6%|███████▏                                                                                                         | 158/2499 [19:23<4:05:24,  6.29s/it]                                                                                                                                                           {'loss': 0.6337, 'grad_norm': 0.15093529224395752, 'learning_rate': 0.0001982836401154881, 'ppl': 1.8846, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4563.48, 'total_tokens': 7088745, 'epoch': 0.19}
+  6%|███████▏                                                                                                         | 158/2499 [19:23<4:05:24,  6.29s/it]  6%|███████▏                                                                                                         | 159/2499 [19:30<4:05:10,  6.29s/it]                                                                                                                                                           {'loss': 0.6969, 'grad_norm': 0.16975665092468262, 'learning_rate': 0.00019826027702813038, 'ppl': 2.0075, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4595.17, 'total_tokens': 7117583, 'epoch': 0.19}
+  6%|███████▏                                                                                                         | 159/2499 [19:30<4:05:10,  6.29s/it]  6%|███████▏                                                                                                         | 160/2499 [19:36<4:04:55,  6.28s/it]                                                                                                                                                           {'loss': 0.6083, 'grad_norm': 0.1516297310590744, 'learning_rate': 0.00019823675739985376, 'ppl': 1.8373, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4718.55, 'total_tokens': 7147178, 'epoch': 0.19}
+  6%|███████▏                                                                                                         | 160/2499 [19:36<4:04:55,  6.28s/it]  6%|███████▎                                                                                                         | 161/2499 [19:42<4:04:46,  6.28s/it]                                                                                                                                                           {'loss': 0.6185, 'grad_norm': 0.14229127764701843, 'learning_rate': 0.00019821308126812803, 'ppl': 1.8561, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4711.98, 'total_tokens': 7176755, 'epoch': 0.19}
+  6%|███████▎                                                                                                         | 161/2499 [19:42<4:04:46,  6.28s/it]  6%|███████▎                                                                                                         | 162/2499 [19:48<4:04:20,  6.27s/it]                                                                                                                                                           {'loss': 0.6169, 'grad_norm': 0.17252376675605774, 'learning_rate': 0.00019818924867067214, 'ppl': 1.8532, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4502.44, 'total_tokens': 7204896, 'epoch': 0.19}
+  6%|███████▎                                                                                                         | 162/2499 [19:48<4:04:20,  6.27s/it]  7%|███████▎                                                                                                         | 163/2499 [19:55<4:04:10,  6.27s/it]                                                                                                                                                           {'loss': 0.6053, 'grad_norm': 0.15479132533073425, 'learning_rate': 0.00019816525964545448, 'ppl': 1.8318, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4656.1, 'total_tokens': 7234072, 'epoch': 0.2}
+  7%|███████▎                                                                                                         | 163/2499 [19:55<4:04:10,  6.27s/it]  7%|███████▍                                                                                                         | 164/2499 [20:01<4:04:32,  6.28s/it]                                                                                                                                                           {'loss': 0.6358, 'grad_norm': 0.1458706557750702, 'learning_rate': 0.0001981411142306925, 'ppl': 1.8885, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4626.2, 'total_tokens': 7263258, 'epoch': 0.2}
+  7%|███████▍                                                                                                         | 164/2499 [20:01<4:04:32,  6.28s/it]  7%|███████▍                                                                                                         | 165/2499 [20:07<4:04:24,  6.28s/it]                                                                                                                                                           {'loss': 0.5665, 'grad_norm': 0.1417934000492096, 'learning_rate': 0.0001981168124648529, 'ppl': 1.7621, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4551.14, 'total_tokens': 7291824, 'epoch': 0.2}
+  7%|███████▍                                                                                                         | 165/2499 [20:07<4:04:24,  6.28s/it]  7%|███████▌                                                                                                         | 166/2499 [20:13<4:04:06,  6.28s/it]                                                                                                                                                           {'loss': 0.6314, 'grad_norm': 0.1490688920021057, 'learning_rate': 0.00019809235438665143, 'ppl': 1.8802, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4564.55, 'total_tokens': 7320418, 'epoch': 0.2}
+  7%|███████▌                                                                                                         | 166/2499 [20:13<4:04:06,  6.28s/it]  7%|███████▌                                                                                                         | 167/2499 [20:20<4:03:43,  6.27s/it]                                                                                                                                                           {'loss': 0.6009, 'grad_norm': 0.1549319177865982, 'learning_rate': 0.0001980677400350529, 'ppl': 1.8238, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4506.43, 'total_tokens': 7348591, 'epoch': 0.2}
+  7%|███████▌                                                                                                         | 167/2499 [20:20<4:03:43,  6.27s/it]  7%|███████▌                                                                                                         | 168/2499 [20:26<4:03:35,  6.27s/it]                                                                                                                                                           {'loss': 0.582, 'grad_norm': 0.1679680198431015, 'learning_rate': 0.000198042969449271, 'ppl': 1.7896, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4706.82, 'total_tokens': 7378083, 'epoch': 0.2}
+  7%|███████▌                                                                                                         | 168/2499 [20:26<4:03:35,  6.27s/it]  7%|███████▋                                                                                                         | 169/2499 [20:32<4:03:31,  6.27s/it]                                                                                                                                                           {'loss': 0.6688, 'grad_norm': 0.16335871815681458, 'learning_rate': 0.0001980180426687684, 'ppl': 1.9519, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4740.61, 'total_tokens': 7407810, 'epoch': 0.2}
+  7%|███████▋                                                                                                         | 169/2499 [20:32<4:03:31,  6.27s/it]  7%|███████▋                                                                                                         | 170/2499 [20:39<4:03:31,  6.27s/it]                                                                                                                                                           {'loss': 0.5984, 'grad_norm': 0.15233907103538513, 'learning_rate': 0.00019799295973325657, 'ppl': 1.8192, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4701.67, 'total_tokens': 7437325, 'epoch': 0.2}
+  7%|███████▋                                                                                                         | 170/2499 [20:39<4:03:31,  6.27s/it]  7%|███████▋                                                                                                         | 171/2499 [20:45<4:03:37,  6.28s/it]                                                                                                                                                           {'loss': 0.6533, 'grad_norm': 0.14838764071464539, 'learning_rate': 0.0001979677206826958, 'ppl': 1.9219, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4422.84, 'total_tokens': 7465136, 'epoch': 0.21}
+  7%|███████▋                                                                                                         | 171/2499 [20:45<4:03:37,  6.28s/it]  7%|███████▊                                                                                                         | 172/2499 [20:51<4:03:32,  6.28s/it]                                                                                                                                                           {'loss': 0.5928, 'grad_norm': 0.1395515352487564, 'learning_rate': 0.000197942325557295, 'ppl': 1.809, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4613.25, 'total_tokens': 7494094, 'epoch': 0.21}
+  7%|███████▊                                                                                                         | 172/2499 [20:51<4:03:32,  6.28s/it]  7%|███████▊                                                                                                         | 173/2499 [20:57<4:03:08,  6.27s/it]                                                                                                                                                           {'loss': 0.6164, 'grad_norm': 0.14091241359710693, 'learning_rate': 0.00019791677439751185, 'ppl': 1.8522, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4590.45, 'total_tokens': 7522794, 'epoch': 0.21}
+  7%|███████▊                                                                                                         | 173/2499 [20:57<4:03:08,  6.27s/it]  7%|███████▊                                                                                                         | 174/2499 [21:04<4:02:50,  6.27s/it]                                                                                                                                                           {'loss': 0.6732, 'grad_norm': 0.16553938388824463, 'learning_rate': 0.0001978910672440525, 'ppl': 1.9605, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4550.86, 'total_tokens': 7551247, 'epoch': 0.21}
+  7%|███████▊                                                                                                         | 174/2499 [21:04<4:02:50,  6.27s/it]  7%|███████▉                                                                                                         | 175/2499 [21:10<4:02:44,  6.27s/it]                                                                                                                                                           {'loss': 0.6298, 'grad_norm': 0.15987837314605713, 'learning_rate': 0.00019786520413787165, 'ppl': 1.8772, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4506.6, 'total_tokens': 7579481, 'epoch': 0.21}
+  7%|███████▉                                                                                                         | 175/2499 [21:10<4:02:44,  6.27s/it]  7%|███████▉                                                                                                         | 176/2499 [21:16<4:02:42,  6.27s/it]                                                                                                                                                           {'loss': 0.6511, 'grad_norm': 0.14235079288482666, 'learning_rate': 0.00019783918512017253, 'ppl': 1.9176, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4668.32, 'total_tokens': 7608756, 'epoch': 0.21}
+  7%|███████▉                                                                                                         | 176/2499 [21:16<4:02:42,  6.27s/it]  7%|████████                                                                                                         | 177/2499 [21:22<4:03:16,  6.29s/it]                                                                                                                                                           {'loss': 0.6042, 'grad_norm': 0.17243558168411255, 'learning_rate': 0.0001978130102324066, 'ppl': 1.8298, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4565.31, 'total_tokens': 7637623, 'epoch': 0.21}
+  7%|████████                                                                                                         | 177/2499 [21:22<4:03:16,  6.29s/it]  7%|████████                                                                                                         | 178/2499 [21:29<4:03:26,  6.29s/it]                                                                                                                                                           {'loss': 0.637, 'grad_norm': 0.16263476014137268, 'learning_rate': 0.00019778667951627382, 'ppl': 1.8908, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4526.74, 'total_tokens': 7666166, 'epoch': 0.21}
+  7%|████████                                                                                                         | 178/2499 [21:29<4:03:26,  6.29s/it]  7%|████████                                                                                                         | 179/2499 [21:35<4:03:04,  6.29s/it]                                                                                                                                                           {'loss': 0.6186, 'grad_norm': 0.15282128751277924, 'learning_rate': 0.00019776019301372225, 'ppl': 1.8563, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4439.86, 'total_tokens': 7693990, 'epoch': 0.21}
+  7%|████████                                                                                                         | 179/2499 [21:35<4:03:04,  6.29s/it]  7%|████████▏                                                                                                        | 180/2499 [21:41<4:02:51,  6.28s/it]                                                                                                                                                           {'loss': 0.6161, 'grad_norm': 0.14302721619606018, 'learning_rate': 0.00019773355076694826, 'ppl': 1.8517, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4753.91, 'total_tokens': 7723820, 'epoch': 0.22}
+  7%|████████▏                                                                                                        | 180/2499 [21:41<4:02:51,  6.28s/it]  7%|████████▏                                                                                                        | 181/2499 [21:48<4:02:32,  6.28s/it]                                                                                                                                                           {'loss': 0.6521, 'grad_norm': 0.1567981094121933, 'learning_rate': 0.00019770675281839624, 'ppl': 1.9196, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4552.84, 'total_tokens': 7752331, 'epoch': 0.22}
+  7%|████████▏                                                                                                        | 181/2499 [21:48<4:02:32,  6.28s/it]  7%|████████▏                                                                                                        | 182/2499 [21:54<4:02:22,  6.28s/it]                                                                                                                                                           {'loss': 0.6502, 'grad_norm': 0.16891400516033173, 'learning_rate': 0.00019767979921075866, 'ppl': 1.9159, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4489.31, 'total_tokens': 7780479, 'epoch': 0.22}
+  7%|████████▏                                                                                                        | 182/2499 [21:54<4:02:22,  6.28s/it]  7%|████████▎                                                                                                        | 183/2499 [22:00<4:02:16,  6.28s/it]                                                                                                                                                           {'loss': 0.62, 'grad_norm': 0.15879429876804352, 'learning_rate': 0.00019765268998697604, 'ppl': 1.8589, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4588.46, 'total_tokens': 7809267, 'epoch': 0.22}
+  7%|████████▎                                                                                                        | 183/2499 [22:00<4:02:16,  6.28s/it]  7%|████████▎                                                                                                        | 184/2499 [22:06<4:02:26,  6.28s/it]                                                                                                                                                           {'loss': 0.6378, 'grad_norm': 0.1598796546459198, 'learning_rate': 0.00019762542519023674, 'ppl': 1.8923, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4576.5, 'total_tokens': 7838088, 'epoch': 0.22}
+  7%|████████▎                                                                                                        | 184/2499 [22:06<4:02:26,  6.28s/it]  7%|████████▎                                                                                                        | 185/2499 [22:13<4:02:51,  6.30s/it]                                                                                                                                                           {'loss': 0.5584, 'grad_norm': 0.1714273989200592, 'learning_rate': 0.00019759800486397703, 'ppl': 1.7479, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4679.83, 'total_tokens': 7867688, 'epoch': 0.22}
+  7%|████████▎                                                                                                        | 185/2499 [22:13<4:02:51,  6.30s/it]  7%|████████▍                                                                                                        | 186/2499 [22:19<4:02:17,  6.29s/it]                                                                                                                                                           {'loss': 0.6139, 'grad_norm': 0.16586022078990936, 'learning_rate': 0.00019757042905188088, 'ppl': 1.8476, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4281.2, 'total_tokens': 7894459, 'epoch': 0.22}
+  7%|████████▍                                                                                                        | 186/2499 [22:19<4:02:17,  6.29s/it]  7%|████████▍                                                                                                        | 187/2499 [22:25<4:02:07,  6.28s/it]                                                                                                                                                           {'loss': 0.6282, 'grad_norm': 0.1663977950811386, 'learning_rate': 0.00019754269779788, 'ppl': 1.8742, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4602.7, 'total_tokens': 7923350, 'epoch': 0.22}
+  7%|████████▍                                                                                                        | 187/2499 [22:25<4:02:07,  6.28s/it]  8%|████████▌                                                                                                        | 188/2499 [22:32<4:01:49,  6.28s/it]                                                                                                                                                           {'loss': 0.5851, 'grad_norm': 0.1668008416891098, 'learning_rate': 0.0001975148111461538, 'ppl': 1.7952, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4571.75, 'total_tokens': 7951987, 'epoch': 0.23}
+  8%|████████▌                                                                                                        | 188/2499 [22:32<4:01:49,  6.28s/it]  8%|████████▌                                                                                                        | 189/2499 [22:38<4:01:32,  6.27s/it]                                                                                                                                                           {'loss': 0.626, 'grad_norm': 0.18379661440849304, 'learning_rate': 0.00019748676914112915, 'ppl': 1.8701, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4556.79, 'total_tokens': 7980520, 'epoch': 0.23}
+  8%|████████▌                                                                                                        | 189/2499 [22:38<4:01:32,  6.27s/it]  8%|████████▌                                                                                                        | 190/2499 [22:44<4:01:34,  6.28s/it]                                                                                                                                                           {'loss': 0.5925, 'grad_norm': 0.13806037604808807, 'learning_rate': 0.00019745857182748054, 'ppl': 1.8085, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4700.59, 'total_tokens': 8010056, 'epoch': 0.23}
+  8%|████████▌                                                                                                        | 190/2499 [22:44<4:01:34,  6.28s/it]  8%|████████▋                                                                                                        | 191/2499 [22:50<4:01:44,  6.28s/it]                                                                                                                                                           {'loss': 0.6313, 'grad_norm': 0.14297842979431152, 'learning_rate': 0.00019743021925012973, 'ppl': 1.8801, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4553.45, 'total_tokens': 8038737, 'epoch': 0.23}
+  8%|████████▋                                                                                                        | 191/2499 [22:50<4:01:44,  6.28s/it]  8%|████████▋                                                                                                        | 192/2499 [22:57<4:02:09,  6.30s/it]                                                                                                                                                           {'loss': 0.6269, 'grad_norm': 0.16967882215976715, 'learning_rate': 0.000197401711454246, 'ppl': 1.8718, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4698.64, 'total_tokens': 8068455, 'epoch': 0.23}
+  8%|████████▋                                                                                                        | 192/2499 [22:57<4:02:09,  6.30s/it]  8%|████████▋                                                                                                        | 193/2499 [23:03<4:01:52,  6.29s/it]                                                                                                                                                           {'loss': 0.5853, 'grad_norm': 0.15979325771331787, 'learning_rate': 0.0001973730484852458, 'ppl': 1.7955, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4662.01, 'total_tokens': 8097728, 'epoch': 0.23}
+  8%|████████▋                                                                                                        | 193/2499 [23:03<4:01:52,  6.29s/it]  8%|████████▊                                                                                                        | 194/2499 [23:09<4:01:24,  6.28s/it]                                                                                                                                                           {'loss': 0.6389, 'grad_norm': 0.1816360056400299, 'learning_rate': 0.00019734423038879283, 'ppl': 1.8944, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4387.73, 'total_tokens': 8125195, 'epoch': 0.23}
+  8%|████████▊                                                                                                        | 194/2499 [23:09<4:01:24,  6.28s/it]  8%|████████▊                                                                                                        | 195/2499 [23:16<4:01:06,  6.28s/it]                                                                                                                                                           {'loss': 0.5965, 'grad_norm': 0.14533467590808868, 'learning_rate': 0.00019731525721079793, 'ppl': 1.8158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4552.24, 'total_tokens': 8153711, 'epoch': 0.23}
+  8%|████████▊                                                                                                        | 195/2499 [23:16<4:01:06,  6.28s/it]  8%|████████▊                                                                                                        | 196/2499 [23:22<4:00:54,  6.28s/it]                                                                                                                                                           {'loss': 0.6688, 'grad_norm': 0.16294941306114197, 'learning_rate': 0.000197286128997419, 'ppl': 1.9519, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4513.63, 'total_tokens': 8182000, 'epoch': 0.24}
+  8%|████████▊                                                                                                        | 196/2499 [23:22<4:00:54,  6.28s/it]  8%|████████▉                                                                                                        | 197/2499 [23:28<4:00:37,  6.27s/it]                                                                                                                                                           {'loss': 0.638, 'grad_norm': 0.15876515209674835, 'learning_rate': 0.00019725684579506095, 'ppl': 1.8927, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4516.46, 'total_tokens': 8210270, 'epoch': 0.24}
+  8%|████████▉                                                                                                        | 197/2499 [23:28<4:00:37,  6.27s/it]  8%|████████▉                                                                                                        | 198/2499 [23:34<4:01:01,  6.28s/it]                                                                                                                                                           {'loss': 0.6244, 'grad_norm': 0.1551365852355957, 'learning_rate': 0.00019722740765037555, 'ppl': 1.8671, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4607.89, 'total_tokens': 8239361, 'epoch': 0.24}
+  8%|████████▉                                                                                                        | 198/2499 [23:34<4:01:01,  6.28s/it]  8%|████████▉                                                                                                        | 199/2499 [23:41<4:01:12,  6.29s/it]                                                                                                                                                           {'loss': 0.6537, 'grad_norm': 0.15418943762779236, 'learning_rate': 0.00019719781461026146, 'ppl': 1.9226, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4639.8, 'total_tokens': 8268621, 'epoch': 0.24}
+  8%|████████▉                                                                                                        | 199/2499 [23:41<4:01:12,  6.29s/it]  8%|█████████                                                                                                        | 200/2499 [23:47<4:00:55,  6.29s/it]                                                                                                                                                           {'loss': 0.6467, 'grad_norm': 0.15851524472236633, 'learning_rate': 0.00019716806672186412, 'ppl': 1.9092, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4664.04, 'total_tokens': 8297884, 'epoch': 0.24}
+  8%|█████████                                                                                                        | 200/2499 [23:47<4:00:55,  6.29s/it][2025-12-28 11:29:23,624] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:29:25,368] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8519337177276611
+[2025-12-28 11:29:26,219] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8512239456176758
+[2025-12-28 11:29:27,107] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8861675262451172
+[2025-12-28 11:29:27,946] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8390281200408936
+[2025-12-28 11:29:27,947] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+  0%|                                                                                                                               | 0/90 [00:00<?, ?it/s][A
+  2%|██▋                                                                                                                    | 2/90 [00:00<00:36,  2.41it/s][A
+  3%|███▉                                                                                                                   | 3/90 [00:01<00:54,  1.58it/s][A
+  4%|█████▎                                                                                                                 | 4/90 [00:02<01:00,  1.43it/s][A
+  6%|██████▌                                                                                                                | 5/90 [00:03<01:05,  1.30it/s][A
+  7%|███████▉                                                                                                               | 6/90 [00:04<01:05,  1.27it/s][A
+  8%|█████████▎                                                                                                             | 7/90 [00:05<01:08,  1.22it/s][A
+  9%|██████████▌                                                                                                            | 8/90 [00:06<01:07,  1.22it/s][A
+ 10%|███████████▉                                                                                                           | 9/90 [00:06<01:08,  1.19it/s][A
+ 11%|█████████████                                                                                                         | 10/90 [00:07<01:06,  1.20it/s][A
+ 12%|██████████████▍                                                                                                       | 11/90 [00:08<01:07,  1.17it/s][A
+ 13%|███████████████▋                                                                                                      | 12/90 [00:09<01:05,  1.19it/s][A
+ 14%|█████████████████                                                                                                     | 13/90 [00:10<01:06,  1.16it/s][A
+ 16%|██████████████████▎                                                                                                   | 14/90 [00:11<01:04,  1.18it/s][A
+ 17%|███████████████████▋                                                                                                  | 15/90 [00:12<01:04,  1.16it/s][A
+ 18%|████████████████████▉                                                                                                 | 16/90 [00:12<01:02,  1.18it/s][A
+ 19%|██████████████████████▎                                                                                               | 17/90 [00:13<01:02,  1.16it/s][A
+ 20%|███████████████████████▌                                                                                              | 18/90 [00:14<01:01,  1.18it/s][A
+ 21%|████████████████████████▉                                                                                             | 19/90 [00:15<01:01,  1.16it/s][A
+ 22%|██████████████████████████▏                                                                                           | 20/90 [00:16<00:59,  1.18it/s][A
+ 23%|███████████████████████████▌                                                                                          | 21/90 [00:17<00:59,  1.15it/s][A
+ 24%|████████████████████████████▊                                                                                         | 22/90 [00:18<00:58,  1.17it/s][A
+ 26%|██████████████████████████████▏                                                                                       | 23/90 [00:18<00:58,  1.15it/s][A
+ 27%|███████████████████████████████▍                                                                                      | 24/90 [00:19<00:56,  1.17it/s][A
+ 28%|████████████████████████████████▊                                                                                     | 25/90 [00:20<00:56,  1.15it/s][A
+ 29%|██████████████████████████████████                                                                                    | 26/90 [00:21<00:54,  1.17it/s][A
+ 30%|███████████████████████████████████▍                                                                                  | 27/90 [00:22<00:55,  1.14it/s][A
+ 31%|████████████████████████████████████▋                                                                                 | 28/90 [00:23<00:53,  1.17it/s][A
+ 32%|██████████████████████████████████████                                                                                | 29/90 [00:24<00:53,  1.13it/s][A
+ 33%|███████████████████████████████████████▎                                                                              | 30/90 [00:24<00:51,  1.16it/s][A
+ 34%|████████████████████████████████████████▋                                                                             | 31/90 [00:25<00:51,  1.14it/s][A
+ 36%|█████████████████████████████████████████▉                                                                            | 32/90 [00:26<00:49,  1.16it/s][A
+ 37%|███████████████████████████████████████████▎                                                                          | 33/90 [00:27<00:50,  1.13it/s][A
+ 38%|████████████████████████████████████████████▌                                                                         | 34/90 [00:28<00:48,  1.16it/s][A
+ 39%|█████████████████████████████████████████████▉                                                                        | 35/90 [00:29<00:48,  1.13it/s][A
+ 40%|███████████████████████████████████████████████▏                                                                      | 36/90 [00:30<00:46,  1.17it/s][A
+ 41%|████████████████████████████████████████████████▌                                                                     | 37/90 [00:31<00:46,  1.15it/s][A
+ 42%|█████████████████████████████████████████████████▊                                                                    | 38/90 [00:31<00:44,  1.17it/s][A
+ 43%|███████████████████���███████████████████████████████▏                                                                  | 39/90 [00:32<00:44,  1.15it/s][A
+ 44%|████████████████████████████████████████████████████▍                                                                 | 40/90 [00:33<00:42,  1.17it/s][A
+ 46%|█████████████████████████████████████████████████████▊                                                                | 41/90 [00:34<00:42,  1.15it/s][A
+ 47%|███████████████████████████████████████████████████████                                                               | 42/90 [00:35<00:41,  1.17it/s][A
+ 48%|████████████████████████████████████████████████████████▍                                                             | 43/90 [00:36<00:40,  1.15it/s][A
+ 49%|█████████████████████████████████████████████████████████▋                                                            | 44/90 [00:37<00:39,  1.16it/s][A
+ 50%|███████████████████████████████████████████████████████████                                                           | 45/90 [00:38<00:39,  1.14it/s][A
+ 51%|████████████████████████████████████████████████████████████▎                                                         | 46/90 [00:38<00:37,  1.16it/s][A
+ 52%|█████████████████████████████████████████████████████████████▌                                                        | 47/90 [00:39<00:37,  1.15it/s][A
+ 53%|██████████████████████████████████████████████████████████████▉                                                       | 48/90 [00:40<00:35,  1.17it/s][A
+ 54%|████████████████████████████████████████████████████████████████▏                                                     | 49/90 [00:41<00:35,  1.15it/s][A
+ 56%|█████████████████████████████████████████████████████████████████▌                                                    | 50/90 [00:42<00:34,  1.17it/s][A
+ 57%|██████████████████████████████████████████████████████████████████▊                                                   | 51/90 [00:43<00:34,  1.14it/s][A
+ 58%|████████████████████████████████████████████████████████████████████▏                                                 | 52/90 [00:43<00:32,  1.17it/s][A
+ 59%|█████████████████████████████████████████████████████████████████████▍                                                | 53/90 [00:44<00:32,  1.15it/s][A
+ 60%|██████████████████████████████████████████████████████████████████████▊                                               | 54/90 [00:45<00:30,  1.17it/s][A
+ 61%|████████████████████████████████████████████████████████████████████████                                              | 55/90 [00:47<00:34,  1.01it/s][A
+ 62%|█████████████████████████████████████████████████████████████████████████▍                                            | 56/90 [00:47<00:28,  1.18it/s][A
+ 63%|██████████████████████████████████████████████████████████████████████████▋                                           | 57/90 [00:48<00:28,  1.16it/s][A
+ 64%|████████████████████████████████████████████████████████████████████████████                                          | 58/90 [00:49<00:27,  1.18it/s][A
+ 66%|█████████████████████████████████████████████████████████████████████████████▎                                        | 59/90 [00:50<00:26,  1.16it/s][A
+ 67%|██████████████████████████████████████████████████████████████████████████████▋                                       | 60/90 [00:50<00:25,  1.18it/s][A
+ 68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 61/90 [00:51<00:25,  1.15it/s][A
+ 69%|█████████████████████████████████████████████████████████████████████████████████▎                                    | 62/90 [00:52<00:23,  1.17it/s][A
+ 70%|██████████████████████████████████████████████████████████████████████████████████▌                                   | 63/90 [00:53<00:26,  1.02it/s][A
+ 71%|███████████████████████████████████████████████████████████████████████████████████▉                                  | 64/90 [00:54<00:24,  1.07it/s][A
+ 72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 65/90 [00:55<00:23,  1.09it/s][A
+ 73%|██████████████████████████████████████████████████████████████████████████████████████▌                               | 66/90 [00:56<00:21,  1.12it/s][A
+ 74%|███████████████████████████████████████████████████████████████████████████████████████▊                              | 67/90 [00:57<00:20,  1.12it/s][A
+ 76%|█████████████████████████████████████████████████████████████████████████████████████████▏                            | 68/90 [00:58<00:19,  1.15it/s][A
+ 77%|██████████████████████████████████████████████████████████████████████████████████████████▍                           | 69/90 [00:59<00:18,  1.14it/s][A
+ 78%|███████████████████████████████████████████████████████████████████████████████████████████▊                          | 70/90 [00:59<00:17,  1.16it/s][A
+ 79%|█████████████████████████████████████████████████████████████████████████████████████████████                         | 71/90 [01:00<00:16,  1.15it/s][A
+ 80%|██████████████████████████████████████████████████████████████████████████████████████████████▍                       | 72/90 [01:01<00:15,  1.17it/s][A
+ 81%|███████████████████████████████████████████████████████████████████████████████████████████████▋                      | 73/90 [01:02<00:14,  1.14it/s][A
+ 82%|█████████████████████████████████████████████████████████████████████████████████████████████████                     | 74/90 [01:03<00:13,  1.17it/s][A
+ 83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 75/90 [01:04<00:13,  1.14it/s][A
+ 84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 76/90 [01:05<00:11,  1.17it/s][A
+ 86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 77/90 [01:06<00:11,  1.14it/s][A
+ 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 78/90 [01:06<00:10,  1.17it/s][A
+ 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 79/90 [01:07<00:09,  1.14it/s][A
+ 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 80/90 [01:08<00:08,  1.17it/s][A
+ 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 81/90 [01:09<00:07,  1.14it/s][A
+ 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 82/90 [01:10<00:06,  1.17it/s][A
+ 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 83/90 [01:11<00:06,  1.14it/s][A
+ 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 84/90 [01:12<00:05,  1.16it/s][A
+ 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 85/90 [01:12<00:04,  1.14it/s][A
+ 96%|█████████████████████████████████████████████████████████��██████████████████████████████████████████████████████▊     | 86/90 [01:13<00:03,  1.16it/s][A
+ 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 87/90 [01:14<00:02,  1.15it/s][A
+ 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 88/90 [01:15<00:01,  1.17it/s][A
+ 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00,  1.15it/s][A
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00,  1.15it/s][A                                                                                                                                                           
+                                                                                                                                                           [A{'eval_loss': 0.6098045110702515, 'eval_runtime': 79.6449, 'eval_samples_per_second': 9.166, 'eval_steps_per_second': 2.298, 'eval_ppl': 1.8401, 'memory/max_active (GiB)': 12.83, 'memory/max_allocated (GiB)': 6.85, 'memory/device_reserved (GiB)': 20.19, 'epoch': 0.24}
+  8%|█████████                                                                                                        | 200/2499 [25:11<4:00:55,  6.29s/it]
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00,  1.15it/s][A
+                                                                                                                                                           [A[2025-12-28 11:30:51,250] [INFO] [axolotl.core.trainers.base._save:692] [PID:42410] Saving model checkpoint to ./outputs/luau-codellama-h200/checkpoint-200