diff --git "a/debug.log" "b/debug.log"
new file mode 100644--- /dev/null
+++ "b/debug.log"
@@ -0,0 +1,763 @@
+[2025-12-28 11:04:35,744] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:42410] baseline 0.000GB ()
+[2025-12-28 11:04:35,746] [INFO] [axolotl.cli.config.load_cfg:256] [PID:42410] config:
+{
+ "activation_offloading": false,
+ "adapter": "lora",
+ "axolotl_config_path": "tuner.yaml",
+ "base_model": "codellama/CodeLlama-7b-hf",
+ "base_model_config": "codellama/CodeLlama-7b-hf",
+ "batch_size": 8,
+ "bf16": true,
+ "capabilities": {
+ "bf16": true,
+ "compute_capability": "sm_90",
+ "fp8": false,
+ "n_gpu": 1,
+ "n_node": 1
+ },
+ "chat_template": "llama3",
+ "context_parallel_size": 1,
+ "dataloader_num_workers": 1,
+ "dataloader_pin_memory": true,
+ "dataloader_prefetch_factor": 256,
+ "dataset_num_proc": 384,
+ "datasets": [
+ {
+ "chat_template": "tokenizer_default",
+ "conversation": "llama3",
+ "field_messages": "messages",
+ "message_property_mappings": {
+ "content": "content",
+ "role": "role"
+ },
+ "path": "darwinkernelpanic/luau-reasoning-normalized",
+ "trust_remote_code": false,
+ "type": "chat_template"
+ }
+ ],
+ "ddp": false,
+ "deepspeed": {
+ "bf16": {
+ "enabled": true
+ },
+ "fp16": {
+ "enabled": false
+ },
+ "gradient_accumulation_steps": "auto",
+ "gradient_clipping": 1.0,
+ "steps_per_print": 2000,
+ "train_micro_batch_size_per_gpu": "auto",
+ "zero_optimization": {
+ "contiguous_gradients": true,
+ "gather_16bit_weights_on_model_save": true,
+ "offload_optimizer": {
+ "device": "cpu",
+ "pin_memory": true
+ },
+ "offload_param": {
+ "device": "cpu",
+ "pin_memory": true
+ },
+ "overlap_comm": true,
+ "reduce_bucket_size": "auto",
+ "stage": 3,
+ "stage3_param_persistence_threshold": "auto",
+ "stage3_prefetch_bucket_size": "auto"
+ }
+ },
+ "device": "cuda:0",
+ "dion_rank_fraction": 1.0,
+ "dion_rank_multiple_of": 1,
+ "env_capabilities": {
+ "torch_version": "2.8.0"
+ },
+ "eval_batch_size": 4,
+ "eval_causal_lm_metrics": [
+ "sacrebleu",
+ "comet",
+ "ter",
+ "chrf"
+ ],
+ "eval_max_new_tokens": 128,
+ "eval_sample_packing": true,
+ "eval_steps": 100,
+ "eval_table_size": 0,
+ "experimental_skip_move_to_device": true,
+ "fp16": false,
+ "gradient_accumulation_steps": 2,
+ "gradient_checkpointing": true,
+ "gradient_checkpointing_kwargs": {
+ "use_reentrant": true
+ },
+ "group_by_length": true,
+ "hub_model_id": "darwinkernelpanic/luau-codellama-7b-reasoning",
+ "hub_strategy": "every_save",
+ "include_tkps": true,
+ "is_falcon_derived_model": false,
+ "is_llama_derived_model": true,
+ "is_mistral_derived_model": false,
+ "learning_rate": 0.0002,
+ "lisa_layers_attribute": "model.layers",
+ "load_best_model_at_end": false,
+ "load_in_4bit": false,
+ "load_in_8bit": false,
+ "local_rank": 0,
+ "logging_steps": 1,
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "lora_r": 16,
+ "lora_target_modules": [
+ "q_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "loraplus_lr_embedding": 1e-06,
+ "lr_scheduler": "cosine",
+ "mean_resizing_embeddings": false,
+ "micro_batch_size": 4,
+ "model_config_type": "llama",
+ "num_epochs": 3.0,
+ "optimizer": "adamw_torch",
+ "otel_metrics_host": "localhost",
+ "otel_metrics_port": 8000,
+ "output_dir": "./outputs/luau-codellama-h200",
+ "pad_to_sequence_len": true,
+ "pretrain_multipack_attn": true,
+ "profiler_steps_start": 0,
+ "qlora_sharded_model_loading": false,
+ "ray_num_workers": 1,
+ "resources_per_worker": {
+ "GPU": 1
+ },
+ "sample_packing": true,
+ "sample_packing_bin_size": 200,
+ "sample_packing_group_size": 100000,
+ "save_only_model": false,
+ "save_safetensors": true,
+ "save_steps": 200,
+ "save_strategy": "steps",
+ "save_total_limit": 3,
+ "seed": 42,
+ "sequence_len": 4096,
+ "shuffle_before_merging_datasets": false,
+ "shuffle_merged_datasets": true,
+ "skip_prepare_dataset": false,
+ "streaming_multipack_buffer_size": 10000,
+ "strict": false,
+ "tensor_parallel_size": 1,
+ "tf32": true,
+ "tiled_mlp_use_original_mlp": true,
+ "tokenizer_config": "codellama/CodeLlama-7b-hf",
+ "tokenizer_save_jinja_files": true,
+ "tokenizer_type": "LlamaTokenizer",
+ "torch_dtype": "torch.bfloat16",
+ "train_on_inputs": false,
+ "trl": {
+ "log_completions": false,
+ "mask_truncated_completions": false,
+ "ref_model_mixup_alpha": 0.9,
+ "ref_model_sync_steps": 64,
+ "scale_rewards": true,
+ "sync_ref_model": false,
+ "use_vllm": false,
+ "vllm_server_host": "0.0.0.0",
+ "vllm_server_port": 8000
+ },
+ "type_of_model": "LlamaForCausalLM",
+ "use_otel_metrics": false,
+ "use_ray": false,
+ "val_set_size": 0.05,
+ "vllm": {
+ "device": "auto",
+ "dtype": "auto",
+ "gpu_memory_utilization": 0.9,
+ "host": "0.0.0.0",
+ "port": 8000
+ },
+ "warmup_steps": 10,
+ "weight_decay": 0.0,
+ "world_size": 1
+}
+[2025-12-28 11:04:36,377] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42410] EOS: 2 /
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42410] BOS: 1 /
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:42410] PAD: 2 /
+[2025-12-28 11:04:36,378] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:42410] UNK: 0 /
+[2025-12-28 11:04:36,378] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:42410] Unable to find prepared dataset in last_run_prepared/b7c17715ff7f64badeb455c51ab5d648
+[2025-12-28 11:04:36,378] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:42410] Loading raw datasets...
+[2025-12-28 11:04:36,378] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:42410] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+[2025-12-28 11:04:38,127] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:42410] Loading dataset: darwinkernelpanic/luau-reasoning-normalized with base_type: chat_template and prompt_style: None
+[2025-12-28 11:04:38,130] [INFO] [axolotl.prompt_strategies.chat_template.__call__:996] [PID:42410] Using chat template:
+---
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
+
+' }}{% endif %}
+
+---
+[2025-12-28 11:04:38,137] [WARNING] [axolotl.prompt_strategies.chat_template._validate_eot_and_eos_tokens:337] [PID:42410] EOS token '' not found in chat_template. Please check if your template/EOS token is correct.
+[2025-12-28 11:04:38,508] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:42410] min_input_len: 636
+[2025-12-28 11:04:38,508] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:42410] max_input_len: 12839
+[2025-12-28 11:04:41,234] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:42410] Dropped 755 samples from dataset
+
Saving the dataset (0/56 shards): 0%| | 0/14586 [00:00, ? examples/s]
Saving the dataset (0/56 shards): 2%|█▎ | 261/14586 [00:01<00:55, 257.60 examples/s]
Saving the dataset (1/56 shards): 2%|█▎ | 261/14586 [00:01<00:55, 257.60 examples/s]
Saving the dataset (2/56 shards): 9%|██████▎ | 1305/14586 [00:01<00:51, 257.60 examples/s]
Saving the dataset (3/56 shards): 9%|██████▎ | 1305/14586 [00:01<00:51, 257.60 examples/s]
Saving the dataset (4/56 shards): 9%|██████▎ | 1305/14586 [00:01<00:51, 257.60 examples/s]
Saving the dataset (5/56 shards): 9%|██████▎ | 1305/14586 [00:01<00:51, 257.60 examples/s]
Saving the dataset (6/56 shards): 11%|███████▌ | 1566/14586 [00:01<00:50, 257.60 examples/s]
Saving the dataset (7/56 shards): 14%|██████████▏ | 2088/14586 [00:01<00:48, 257.60 examples/s]
Saving the dataset (8/56 shards): 18%|████████████▋ | 2610/14586 [00:01<00:46, 257.60 examples/s]
Saving the dataset (9/56 shards): 18%|████████████▋ | 2610/14586 [00:01<00:46, 257.60 examples/s]
Saving the dataset (10/56 shards): 18%|████████████▌ | 2610/14586 [00:01<00:46, 257.60 examples/s]
Saving the dataset (11/56 shards): 21%|███████████████ | 3132/14586 [00:01<00:44, 257.60 examples/s]
Saving the dataset (12/56 shards): 21%|███████████████ | 3132/14586 [00:01<00:44, 257.60 examples/s]
Saving the dataset (13/56 shards): 27%|██████████████████▊ | 3915/14586 [00:01<00:41, 257.60 examples/s]
Saving the dataset (14/56 shards): 27%|██████████████████▊ | 3915/14586 [00:01<00:41, 257.60 examples/s]
Saving the dataset (15/56 shards): 29%|████████████████████ | 4176/14586 [00:01<00:40, 257.60 examples/s]
Saving the dataset (16/56 shards): 29%|████████████████████ | 4176/14586 [00:01<00:40, 257.60 examples/s]
Saving the dataset (17/56 shards): 30%|█████████████████████▎ | 4437/14586 [00:01<00:39, 257.60 examples/s]
Saving the dataset (18/56 shards): 36%|█████████████████████████ | 5220/14586 [00:01<00:36, 257.60 examples/s]
Saving the dataset (19/56 shards): 36%|█████████████████████████ | 5220/14586 [00:01<00:36, 257.60 examples/s]
Saving the dataset (20/56 shards): 36%|█████████████████████████ | 5220/14586 [00:01<00:36, 257.60 examples/s]
Saving the dataset (21/56 shards): 39%|███████████████████████████▌ | 5742/14586 [00:01<00:34, 257.60 examples/s]
Saving the dataset (22/56 shards): 39%|███████████████████████████▌ | 5742/14586 [00:01<00:34, 257.60 examples/s]
Saving the dataset (23/56 shards): 45%|███████████████████████████████▎ | 6525/14586 [00:01<00:31, 257.60 examples/s]
Saving the dataset (24/56 shards): 45%|███████████████████████████████▎ | 6525/14586 [00:01<00:31, 257.60 examples/s]
Saving the dataset (25/56 shards): 45%|███████████████████████████████▎ | 6525/14586 [00:01<00:31, 257.60 examples/s]
Saving the dataset (26/56 shards): 50%|███████████████████████████████████ | 7306/14586 [00:01<00:28, 257.60 examples/s]
Saving the dataset (27/56 shards): 50%|███████████████████████████████████ | 7306/14586 [00:01<00:28, 257.60 examples/s]
Saving the dataset (28/56 shards): 50%|███████████████████████████████████ | 7306/14586 [00:01<00:28, 257.60 examples/s]
Saving the dataset (29/56 shards): 52%|████████████████████████████████████▎ | 7566/14586 [00:01<00:27, 257.60 examples/s]
Saving the dataset (30/56 shards): 55%|██████████████████████████████████████▊ | 8086/14586 [00:01<00:25, 257.60 examples/s]
Saving the dataset (31/56 shards): 57%|████████████████████████████████████████ | 8346/14586 [00:01<00:24, 257.60 examples/s]
Saving the dataset (32/56 shards): 57%|████████████████████████████████████████ | 8346/14586 [00:01<00:24, 257.60 examples/s]
Saving the dataset (33/56 shards): 64%|█████████████████████████████████████████████ | 9386/14586 [00:01<00:20, 257.60 examples/s]
Saving the dataset (34/56 shards): 64%|█████████████████████████████████████████████ | 9386/14586 [00:01<00:20, 257.60 examples/s]
Saving the dataset (35/56 shards): 64%|█████████████████████████████████████████████ | 9386/14586 [00:01<00:20, 257.60 examples/s]
Saving the dataset (36/56 shards): 66%|██████████████████████████████████████████████▎ | 9646/14586 [00:01<00:19, 257.60 examples/s]
Saving the dataset (37/56 shards): 66%|██████████████████████████████████████████████▎ | 9646/14586 [00:01<00:19, 257.60 examples/s]
Saving the dataset (38/56 shards): 68%|███████████████████████████████████████████████▌ | 9906/14586 [00:01<00:18, 257.60 examples/s]
Saving the dataset (39/56 shards): 71%|█████████████████████████████████████████████████▎ | 10426/14586 [00:01<00:16, 257.60 examples/s]
Saving the dataset (40/56 shards): 71%|█████████████████████████████████████████████████▎ | 10426/14586 [00:01<00:16, 257.60 examples/s]
Saving the dataset (41/56 shards): 73%|██████████████████████████████████████████████████▌ | 10686/14586 [00:01<00:15, 257.60 examples/s]
Saving the dataset (42/56 shards): 75%|███████████████████████████████████████████████████▊ | 10946/14586 [00:01<00:14, 257.60 examples/s]
Saving the dataset (43/56 shards): 77%|█████████████████████████████████████████████████████ | 11206/14586 [00:01<00:13, 257.60 examples/s]
Saving the dataset (44/56 shards): 80%|███████████████████████████████████████████████████████▍ | 11726/14586 [00:01<00:11, 257.60 examples/s]
Saving the dataset (45/56 shards): 80%|███████████████████████████████████████████████████████▍ | 11726/14586 [00:01<00:11, 257.60 examples/s]
Saving the dataset (46/56 shards): 86%|███████████████████████████████████████████████████████████▏ | 12506/14586 [00:01<00:08, 257.60 examples/s]
Saving the dataset (47/56 shards): 89%|█████████████████████████████████████████████████████████████▌ | 13026/14586 [00:01<00:06, 257.60 examples/s]
Saving the dataset (48/56 shards): 91%|██████████████████████████████████████████████████████████████▊ | 13286/14586 [00:01<00:05, 257.60 examples/s]
Saving the dataset (49/56 shards): 91%|██████████████████████████████████████████████████████████████▊ | 13286/14586 [00:01<00:05, 257.60 examples/s]
Saving the dataset (50/56 shards): 91%|██████████████████████████████████████████████████████████████▊ | 13286/14586 [00:01<00:05, 257.60 examples/s]
Saving the dataset (51/56 shards): 91%|██████████████████████████████████████████████████████████████▊ | 13286/14586 [00:01<00:05, 257.60 examples/s]
Saving the dataset (52/56 shards): 95%|█████████████████████████████████████████████████████████████████▎ | 13806/14586 [00:01<00:03, 257.60 examples/s]
Saving the dataset (53/56 shards): 95%|█████████████████████████████████████████████████████████████████▎ | 13806/14586 [00:01<00:03, 257.60 examples/s]
Saving the dataset (54/56 shards): 96%|██████████████████████████████████████████████████████████████████▌ | 14066/14586 [00:01<00:02, 257.60 examples/s]
Saving the dataset (55/56 shards): 100%|█████████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 257.60 examples/s]
Saving the dataset (56/56 shards): 100%|█████████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 257.60 examples/s]
Saving the dataset (56/56 shards): 100%|███████████████████████████████████████████████████████████████████| 14586/14586 [00:01<00:00, 13067.01 examples/s]
+[2025-12-28 11:04:48,679] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:42410] total_num_tokens: 1_357_721
+[2025-12-28 11:04:48,684] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:42410] `total_supervised_tokens: 1_271_453`
+[2025-12-28 11:04:48,692] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,247] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,552] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3049201965332031
+[2025-12-28 11:04:49,552] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:49,839] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.2874150276184082
+[2025-12-28 11:04:49,840] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:50,133] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.2929878234863281
+[2025-12-28 11:04:50,133] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.27963781356811523
+[2025-12-28 11:04:50,413] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:42410] data_loader_len: 45
+[2025-12-28 11:04:50,413] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:42410] sample_packing_eff_est across ranks: [0.9131538664342287]
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:42410] sample_packing_eff_est: None
+[2025-12-28 11:04:50,413] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:42410] total_num_steps: 135
+[2025-12-28 11:04:50,489] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:42410] total_num_tokens: 25_392_481
+[2025-12-28 11:04:50,608] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:42410] `total_supervised_tokens: 23_772_065`
+[2025-12-28 11:04:50,703] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,045] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,353] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.30963134765625
+[2025-12-28 11:04:51,355] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,664] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.31055235862731934
+[2025-12-28 11:04:51,666] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:51,976] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3107116222381592
+[2025-12-28 11:04:51,977] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:42410] Using single process for pack_parallel, running sequentially.
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.3080286979675293
+[2025-12-28 11:04:52,284] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [1667]
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:42410] data_loader_len: 833
+[2025-12-28 11:04:52,284] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:42410] sample_packing_eff_est across ranks: [0.9284613122121649]
+[2025-12-28 11:04:52,284] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:42410] sample_packing_eff_est: 0.93
+[2025-12-28 11:04:52,285] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:42410] total_num_steps: 2499
+[2025-12-28 11:04:52,287] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:42410] Maximum number of steps set at 2499
+[2025-12-28 11:04:52,291] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:42410] loading tokenizer... codellama/CodeLlama-7b-hf
+[2025-12-28 11:04:52,784] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:42410] EOS: 2 /
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:42410] BOS: 1 /
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:42410] PAD: 2 /
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:42410] UNK: 0 /
+[2025-12-28 11:04:52,785] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:42410] Loading model
+[2025-12-28 11:04:52,926] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:42410] Patched Trainer.evaluation_loop with nanmean loss calculation
+[2025-12-28 11:04:52,927] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:42410] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
+[2025-12-28 11:04:52,927] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:42410] Applying multipack dataloader patch for sample packing...
+[2025-12-28 11:04:52,927] [INFO] [axolotl.loaders.patch_manager._patch_llama_sample_packing:430] [PID:42410] Patching llama _prepare_4d_causal_attention_mask*...
+
Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]
Loading checkpoint shards: 50%|██████████████████████████████████████████████▌ | 1/2 [00:01<00:01, 1.42s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.14it/s]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.04it/s]
+
generation_config.json: 0%| | 0.00/116 [00:00, ?B/s]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 1.91MB/s]
+[2025-12-28 11:05:00,338] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:42410] Converting modules to torch.bfloat16
+[2025-12-28 11:05:00,339] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42410] Memory usage after model load 0.500GB (+0.500GB allocated, +0.510GB reserved)
+trainable params: 16,777,216 || all params: 6,755,323,904 || trainable%: 0.2484
+[2025-12-28 11:05:00,457] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:42410] after adapters 0.063GB (+0.063GB allocated, +0.572GB reserved)
+[2025-12-28 11:05:05,368] [INFO] [axolotl.train.save_initial_configs:413] [PID:42410] Pre-saving adapter config to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,368] [INFO] [axolotl.train.save_initial_configs:417] [PID:42410] Pre-saving tokenizer to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,369] [INFO] [axolotl.train.save_initial_configs:422] [PID:42410] Pre-saving model config to ./outputs/luau-codellama-h200...
+[2025-12-28 11:05:05,370] [INFO] [axolotl.train.execute_training:212] [PID:42410] Starting trainer...
+[2025-12-28 11:05:07,213] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6575620174407959
+[2025-12-28 11:05:07,824] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6103956699371338
+[2025-12-28 11:05:08,456] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.6313827037811279
+[2025-12-28 11:05:09,110] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.653618574142456
+[2025-12-28 11:05:09,110] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [1666]
+Parameter Offload - Persistent parameters statistics: param_count = 65, numel = 266240
+
0%| | 0/2499 [00:00, ?it/s][2025-12-28 11:05:36,131] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:05:37,595] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7198138236999512
+[2025-12-28 11:05:38,326] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7297320365905762
+[2025-12-28 11:05:39,063] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7372677326202393
+[2025-12-28 11:05:39,815] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.7512753009796143
+[2025-12-28 11:05:39,815] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+
0%| | 0/90 [00:00, ?it/s][A
+
2%|██▋ | 2/90 [00:00<00:25, 3.46it/s][A
+
3%|███▉ | 3/90 [00:01<00:42, 2.04it/s][A
+
4%|█████▎ | 4/90 [00:02<00:52, 1.65it/s][A
+
6%|██████▌ | 5/90 [00:03<01:06, 1.29it/s][A
+
7%|███████▉ | 6/90 [00:04<01:04, 1.29it/s][A
+
8%|█████████▎ | 7/90 [00:04<01:07, 1.23it/s][A
+
9%|██████████▌ | 8/90 [00:05<01:06, 1.23it/s][A
+
10%|███████████▉ | 9/90 [00:06<01:07, 1.20it/s][A
+
11%|█████████████ | 10/90 [00:07<01:06, 1.21it/s][A
+
12%|██████████████▍ | 11/90 [00:08<01:07, 1.17it/s][A
+
13%|███████████████▋ | 12/90 [00:09<01:05, 1.19it/s][A
+
14%|█████████████████ | 13/90 [00:10<01:06, 1.16it/s][A
+
16%|██████████████████▎ | 14/90 [00:10<01:04, 1.18it/s][A
+
17%|███████████████████▋ | 15/90 [00:11<01:05, 1.15it/s][A
+
18%|████████████████████▉ | 16/90 [00:12<01:02, 1.18it/s][A
+
19%|██████████████████████▎ | 17/90 [00:13<01:03, 1.16it/s][A
+
20%|███████████████████████▌ | 18/90 [00:14<01:01, 1.18it/s][A
+
21%|████████████████████████▉ | 19/90 [00:15<01:01, 1.16it/s][A
+
22%|██████████████████████████▏ | 20/90 [00:16<00:59, 1.18it/s][A
+
23%|███████████████████████████▌ | 21/90 [00:16<00:59, 1.15it/s][A
+
24%|████████████████████████████▊ | 22/90 [00:17<00:57, 1.18it/s][A
+
26%|██████████████████████████████▏ | 23/90 [00:18<00:59, 1.13it/s][A
+
27%|███████████████████████████████▍ | 24/90 [00:19<00:56, 1.16it/s][A
+
28%|████████████████████████████████▊ | 25/90 [00:20<00:57, 1.14it/s][A
+
29%|██████████████████████████████████ | 26/90 [00:21<00:54, 1.16it/s][A
+
30%|███████████████████████████████████▍ | 27/90 [00:22<00:55, 1.14it/s][A
+
31%|████████████████████████████████████▋ | 28/90 [00:22<00:53, 1.17it/s][A
+
32%|██████████████████████████████████████ | 29/90 [00:23<00:53, 1.14it/s][A
+
33%|███████████████████████████████████████▎ | 30/90 [00:24<00:51, 1.16it/s][A
+
34%|████████████████████████████████████████▋ | 31/90 [00:25<00:52, 1.13it/s][A
+
36%|█████████████████████████████████████████▉ | 32/90 [00:26<00:50, 1.16it/s][A
+
37%|███████████████████████████████████████████▎ | 33/90 [00:27<00:50, 1.13it/s][A
+
38%|████████████████████████████████████████████▌ | 34/90 [00:28<00:48, 1.16it/s][A
+
39%|█████████████████████████████████████████████▉ | 35/90 [00:29<00:48, 1.14it/s][A
+
40%|███████████████████████████████████████████████▏ | 36/90 [00:29<00:46, 1.17it/s][A
+
41%|████████████████████████████████████████████████▌ | 37/90 [00:30<00:46, 1.14it/s][A
+
42%|█████████████████████████████████████████████████▊ | 38/90 [00:31<00:44, 1.17it/s][A
+
43%|███████████████████████████████████████████████████▏ | 39/90 [00:32<00:44, 1.14it/s][A
+
44%|████████████████████████████████████████████████████▍ | 40/90 [00:33<00:42, 1.17it/s][A
+
46%|█████████████████████████████████████████████████████▊ | 41/90 [00:34<00:42, 1.15it/s][A
+
47%|███████████████████████████████████████████████████████ | 42/90 [00:35<00:40, 1.17it/s][A
+
48%|████████████████████████████████████████████████████████▍ | 43/90 [00:36<00:40, 1.15it/s][A
+
49%|█████████████████████████████████████████████████████████▋ | 44/90 [00:36<00:39, 1.17it/s][A
+
50%|███████████████████████████████████████████████████████████ | 45/90 [00:37<00:39, 1.15it/s][A
+
51%|████████████████████████████████████████████████████████████▎ | 46/90 [00:38<00:37, 1.17it/s][A
+
52%|█████████████████████████████████████████████████████████████▌ | 47/90 [00:39<00:37, 1.14it/s][A
+
53%|██████████████████████████████████████████████████████████████▉ | 48/90 [00:40<00:36, 1.17it/s][A
+
54%|████████████████████████████████████████████████████████████████▏ | 49/90 [00:41<00:35, 1.15it/s][A
+
56%|█████████████████████████████████████████████████████████████████▌ | 50/90 [00:42<00:34, 1.17it/s][A
+
57%|██████████████████████████████████████████████████████████████████▊ | 51/90 [00:42<00:33, 1.15it/s][A
+
58%|████████████████████████████████████████████████████████████████████▏ | 52/90 [00:43<00:32, 1.17it/s][A
+
59%|█████████████████████████████████████████████████████████████████████▍ | 53/90 [00:44<00:32, 1.15it/s][A
+
60%|██████████████████████████████████████████████████████████████████████▊ | 54/90 [00:45<00:30, 1.17it/s][A
+
61%|████████████████████████████████████████████████████████████████████████ | 55/90 [00:46<00:30, 1.14it/s][A
+
62%|█████████████████████████████████████████████████████████████████████████▍ | 56/90 [00:47<00:29, 1.16it/s][A
+
63%|██████████████████████████████████████████████████████████████████████████▋ | 57/90 [00:48<00:28, 1.14it/s][A
+
64%|████████████████████████████████████████████████████████████████████████████ | 58/90 [00:48<00:27, 1.16it/s][A
+
66%|█████████████████████████████████████████████████████████████████████████████▎ | 59/90 [00:49<00:27, 1.14it/s][A
+
67%|██████████████████████████████████████████████████████████████████████████████▋ | 60/90 [00:50<00:25, 1.16it/s][A
+
68%|█████████████████████████████████████████████��█████████████████████████████████▉ | 61/90 [00:51<00:25, 1.14it/s][A
+
69%|█████████████████████████████████████████████████████████████████████████████████▎ | 62/90 [00:52<00:24, 1.16it/s][A
+
70%|██████████████████████████████████████████████████████████████████████████████████▌ | 63/90 [00:53<00:23, 1.13it/s][A
+
71%|███████████████████████████████████████████████████████████████████████████████████▉ | 64/90 [00:54<00:22, 1.15it/s][A
+
72%|█████████████████████████████████████████████████████████████████████████████████████▏ | 65/90 [00:55<00:21, 1.14it/s][A
+
73%|██████████████████████████████████████████████████████████████████████████████████████▌ | 66/90 [00:55<00:20, 1.16it/s][A
+
74%|███████████████████████████████████████████████████████████████████████████████████████▊ | 67/90 [00:56<00:20, 1.14it/s][A
+
76%|█████████████████████████████████████████████████████████████████████████████████████████▏ | 68/90 [00:57<00:18, 1.17it/s][A
+
77%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 69/90 [00:58<00:19, 1.06it/s][A
+
78%|███████████████████████████████████████████████████████████████████████████████████████████▊ | 70/90 [00:59<00:18, 1.11it/s][A
+
79%|█████████████████████████████████████████████████████████████████████████████████████████████ | 71/90 [01:00<00:17, 1.11it/s][A
+
80%|██████████████████████████████████████████████████████████████████████████████████████████████▍ | 72/90 [01:01<00:15, 1.14it/s][A
+
81%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 73/90 [01:02<00:14, 1.13it/s][A
+
82%|█████████████████████████████████████████████████████████████████████████████████████████████████ | 74/90 [01:03<00:13, 1.16it/s][A
+
83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎ | 75/90 [01:03<00:13, 1.13it/s][A
+
84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋ | 76/90 [01:04<00:12, 1.16it/s][A
+
86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 77/90 [01:05<00:11, 1.13it/s][A
+
87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 78/90 [01:06<00:10, 1.16it/s][A
+
88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 79/90 [01:07<00:09, 1.13it/s][A
+
89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 80/90 [01:08<00:08, 1.16it/s][A
+
90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 81/90 [01:09<00:07, 1.14it/s][A
+
91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 82/90 [01:09<00:06, 1.16it/s][A
+
92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 83/90 [01:10<00:06, 1.14it/s][A
+
93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 84/90 [01:11<00:05, 1.16it/s][A
+
94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 85/90 [01:12<00:04, 1.13it/s][A
+
96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 86/90 [01:13<00:03, 1.16it/s][A
+
97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 87/90 [01:14<00:02, 1.15it/s][A
+
98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 88/90 [01:15<00:01, 1.17it/s][A
+
99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00, 1.15it/s][A
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:16<00:00, 1.16it/s][A
+
[A{'eval_loss': 1.6886017322540283, 'eval_runtime': 79.9199, 'eval_samples_per_second': 9.134, 'eval_steps_per_second': 2.29, 'eval_ppl': 5.4119, 'memory/max_active (GiB)': 11.16, 'memory/max_allocated (GiB)': 5.19, 'memory/device_reserved (GiB)': 13.81, 'epoch': 0}
+
0%| | 0/2499 [01:23, ?it/s]
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.16it/s][A
+
[A
0%| | 1/2499 [01:32<64:06:25, 92.39s/it]
{'loss': 2.0336, 'grad_norm': 1.6855894327163696, 'learning_rate': 0.0, 'ppl': 7.6415, 'memory/max_active (GiB)': 16.07, 'memory/max_allocated (GiB)': 10.54, 'memory/device_reserved (GiB)': 18.02, 'tokens_per_second_per_gpu': 197269.0, 'total_tokens': 1298183, 'epoch': 0.0}
+
0%| | 1/2499 [01:32<64:06:25, 92.39s/it]
0%| | 2/2499 [01:38<28:57:02, 41.74s/it]
{'loss': 1.7737, 'grad_norm': 1.1572575569152832, 'learning_rate': 2e-05, 'ppl': 5.8926, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.97, 'tokens_per_second_per_gpu': 4746.09, 'total_tokens': 1327999, 'epoch': 0.0}
+
0%| | 2/2499 [01:38<28:57:02, 41.74s/it]
0%|▏ | 3/2499 [01:44<17:42:14, 25.53s/it]
{'loss': 1.85, 'grad_norm': 1.594330072402954, 'learning_rate': 4e-05, 'ppl': 6.3598, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.99, 'tokens_per_second_per_gpu': 4622.21, 'total_tokens': 1356883, 'epoch': 0.0}
+
0%|▏ | 3/2499 [01:44<17:42:14, 25.53s/it]
0%|▏ | 4/2499 [01:51<12:25:28, 17.93s/it]
{'loss': 1.6567, 'grad_norm': 1.557888150215149, 'learning_rate': 6e-05, 'ppl': 5.242, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 19.99, 'tokens_per_second_per_gpu': 4690.94, 'total_tokens': 1386261, 'epoch': 0.0}
+
0%|▏ | 4/2499 [01:51<12:25:28, 17.93s/it]
0%|▏ | 5/2499 [01:57<9:30:55, 13.74s/it]
{'loss': 1.9046, 'grad_norm': 1.6567342281341553, 'learning_rate': 8e-05, 'ppl': 6.7167, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4507.97, 'total_tokens': 1414659, 'epoch': 0.01}
+
0%|▏ | 5/2499 [01:57<9:30:55, 13.74s/it]
0%|▎ | 6/2499 [02:03<7:45:45, 11.21s/it]
{'loss': 1.8432, 'grad_norm': 1.6043676137924194, 'learning_rate': 0.0001, 'ppl': 6.3167, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4597.87, 'total_tokens': 1443641, 'epoch': 0.01}
+
0%|▎ | 6/2499 [02:03<7:45:45, 11.21s/it]
0%|▎ | 7/2499 [02:10<6:38:32, 9.60s/it]
{'loss': 1.492, 'grad_norm': 1.2741687297821045, 'learning_rate': 0.00012, 'ppl': 4.446, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4542.53, 'total_tokens': 1472125, 'epoch': 0.01}
+
0%|▎ | 7/2499 [02:10<6:38:32, 9.60s/it]
0%|▎ | 8/2499 [02:16<5:54:25, 8.54s/it]
{'loss': 1.4809, 'grad_norm': 1.3272074460983276, 'learning_rate': 0.00014, 'ppl': 4.3969, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4670.47, 'total_tokens': 1501396, 'epoch': 0.01}
+
0%|▎ | 8/2499 [02:16<5:54:25, 8.54s/it]
0%|▍ | 9/2499 [02:22<5:24:48, 7.83s/it]
{'loss': 1.238, 'grad_norm': 1.0670270919799805, 'learning_rate': 0.00016, 'ppl': 3.4487, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4547.02, 'total_tokens': 1529874, 'epoch': 0.01}
+
0%|▍ | 9/2499 [02:22<5:24:48, 7.83s/it]
0%|▍ | 10/2499 [02:28<5:04:46, 7.35s/it]
{'loss': 1.2017, 'grad_norm': 0.9426001906394958, 'learning_rate': 0.00018, 'ppl': 3.3258, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4686.7, 'total_tokens': 1559258, 'epoch': 0.01}
+
0%|▍ | 10/2499 [02:28<5:04:46, 7.35s/it]
0%|▌ | 11/2499 [02:35<4:51:01, 7.02s/it]
{'loss': 1.1605, 'grad_norm': 0.8342238664627075, 'learning_rate': 0.0002, 'ppl': 3.1915, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4637.04, 'total_tokens': 1588337, 'epoch': 0.01}
+
0%|▌ | 11/2499 [02:35<4:51:01, 7.02s/it]
0%|▌ | 12/2499 [02:41<4:42:06, 6.81s/it]
{'loss': 1.2037, 'grad_norm': 0.9213444590568542, 'learning_rate': 0.00019999992034374237, 'ppl': 3.3324, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4643.51, 'total_tokens': 1617675, 'epoch': 0.01}
+
0%|▌ | 12/2499 [02:41<4:42:06, 6.81s/it]
1%|▌ | 13/2499 [02:47<4:35:33, 6.65s/it]
{'loss': 1.0463, 'grad_norm': 0.5648354887962341, 'learning_rate': 0.0001999996813750963, 'ppl': 2.8471, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4691.68, 'total_tokens': 1647182, 'epoch': 0.02}
+
1%|▌ | 13/2499 [02:47<4:35:33, 6.65s/it]
1%|▋ | 14/2499 [02:54<4:30:45, 6.54s/it]
{'loss': 1.0009, 'grad_norm': 0.4093482494354248, 'learning_rate': 0.0001999992830944426, 'ppl': 2.7207, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4583.72, 'total_tokens': 1675932, 'epoch': 0.02}
+
1%|▋ | 14/2499 [02:54<4:30:45, 6.54s/it]
1%|▋ | 15/2499 [03:00<4:27:25, 6.46s/it]
{'loss': 1.0439, 'grad_norm': 0.6911133527755737, 'learning_rate': 0.0001999987255024157, 'ppl': 2.8403, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4700.02, 'total_tokens': 1705435, 'epoch': 0.02}
+
1%|▋ | 15/2499 [03:00<4:27:25, 6.46s/it]
1%|▋ | 16/2499 [03:06<4:24:59, 6.40s/it]
{'loss': 1.0052, 'grad_norm': 0.647537112236023, 'learning_rate': 0.0001999980085999039, 'ppl': 2.7325, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4640.77, 'total_tokens': 1734534, 'epoch': 0.02}
+
1%|▋ | 16/2499 [03:06<4:24:59, 6.40s/it]
1%|▊ | 17/2499 [03:12<4:23:03, 6.36s/it]
{'loss': 0.8606, 'grad_norm': 0.24260607361793518, 'learning_rate': 0.0001999971323880494, 'ppl': 2.3646, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4574.84, 'total_tokens': 1763148, 'epoch': 0.02}
+
1%|▊ | 17/2499 [03:12<4:23:03, 6.36s/it]
1%|▊ | 18/2499 [03:19<4:21:50, 6.33s/it]
{'loss': 0.9237, 'grad_norm': 0.34218189120292664, 'learning_rate': 0.00019999609686824802, 'ppl': 2.5186, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4659.05, 'total_tokens': 1792345, 'epoch': 0.02}
+
1%|▊ | 18/2499 [03:19<4:21:50, 6.33s/it]
1%|▊ | 19/2499 [03:25<4:21:38, 6.33s/it]
{'loss': 0.8695, 'grad_norm': 0.6931776404380798, 'learning_rate': 0.00019999490204214958, 'ppl': 2.3857, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4737.23, 'total_tokens': 1822296, 'epoch': 0.02}
+
1%|▊ | 19/2499 [03:25<4:21:38, 6.33s/it]
1%|▉ | 20/2499 [03:31<4:21:01, 6.32s/it]
{'loss': 0.8121, 'grad_norm': 0.29975464940071106, 'learning_rate': 0.00019999354791165749, 'ppl': 2.2526, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4681.06, 'total_tokens': 1851717, 'epoch': 0.02}
+
1%|▉ | 20/2499 [03:31<4:21:01, 6.32s/it]
1%|▉ | 21/2499 [03:38<4:20:42, 6.31s/it]
{'loss': 0.8624, 'grad_norm': 0.25352585315704346, 'learning_rate': 0.0001999920344789291, 'ppl': 2.3688, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4727.48, 'total_tokens': 1881491, 'epoch': 0.03}
+
1%|▉ | 21/2499 [03:38<4:20:42, 6.31s/it]
1%|█ | 22/2499 [03:44<4:20:30, 6.31s/it]
{'loss': 0.8812, 'grad_norm': 0.3950115144252777, 'learning_rate': 0.00019999036174637546, 'ppl': 2.4138, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4758.76, 'total_tokens': 1911483, 'epoch': 0.03}
+
1%|█ | 22/2499 [03:44<4:20:30, 6.31s/it]
1%|█ | 23/2499 [03:50<4:19:54, 6.30s/it]
{'loss': 0.819, 'grad_norm': 0.24307860434055328, 'learning_rate': 0.0001999885297166615, 'ppl': 2.2682, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4582.96, 'total_tokens': 1940207, 'epoch': 0.03}
+
1%|█ | 23/2499 [03:50<4:19:54, 6.30s/it]
1%|█ | 24/2499 [03:56<4:19:14, 6.28s/it]
{'loss': 0.7642, 'grad_norm': 0.17830020189285278, 'learning_rate': 0.00019998653839270583, 'ppl': 2.1473, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4526.68, 'total_tokens': 1968501, 'epoch': 0.03}
+
1%|█ | 24/2499 [03:56<4:19:14, 6.28s/it]
1%|█▏ | 25/2499 [04:03<4:19:12, 6.29s/it]
{'loss': 0.7952, 'grad_norm': 0.1788649559020996, 'learning_rate': 0.0001999843877776809, 'ppl': 2.2149, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4562.49, 'total_tokens': 1997194, 'epoch': 0.03}
+
1%|█▏ | 25/2499 [04:03<4:19:12, 6.29s/it]
1%|█▏ | 26/2499 [04:09<4:19:12, 6.29s/it]
{'loss': 0.8073, 'grad_norm': 0.24912691116333008, 'learning_rate': 0.00019998207787501286, 'ppl': 2.2418, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4474.8, 'total_tokens': 2025344, 'epoch': 0.03}
+
1%|█▏ | 26/2499 [04:09<4:19:12, 6.29s/it]
1%|█▏ | 27/2499 [04:15<4:18:55, 6.28s/it]
{'loss': 0.7831, 'grad_norm': 0.21209484338760376, 'learning_rate': 0.00019997960868838174, 'ppl': 2.1882, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4661.31, 'total_tokens': 2054571, 'epoch': 0.03}
+
1%|█▏ | 27/2499 [04:15<4:18:55, 6.28s/it]
1%|█▎ | 28/2499 [04:21<4:18:38, 6.28s/it]
{'loss': 0.7746, 'grad_norm': 0.216914564371109, 'learning_rate': 0.0001999769802217212, 'ppl': 2.1697, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4733.21, 'total_tokens': 2084241, 'epoch': 0.03}
+
1%|█▎ | 28/2499 [04:21<4:18:38, 6.28s/it]
1%|█▎ | 29/2499 [04:28<4:18:15, 6.27s/it]
{'loss': 0.8475, 'grad_norm': 0.207558274269104, 'learning_rate': 0.0001999741924792188, 'ppl': 2.3338, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4546.06, 'total_tokens': 2112679, 'epoch': 0.03}
+
1%|█▎ | 29/2499 [04:28<4:18:15, 6.27s/it]
1%|█▎ | 30/2499 [04:34<4:17:50, 6.27s/it]
{'loss': 0.7692, 'grad_norm': 0.21438081562519073, 'learning_rate': 0.0001999712454653157, 'ppl': 2.158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4561.33, 'total_tokens': 2141169, 'epoch': 0.04}
+
1%|█▎ | 30/2499 [04:34<4:17:50, 6.27s/it]
1%|█▍ | 31/2499 [04:40<4:17:31, 6.26s/it]
{'loss': 0.7869, 'grad_norm': 0.16961662471294403, 'learning_rate': 0.00019996813918470686, 'ppl': 2.1966, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4358.55, 'total_tokens': 2168390, 'epoch': 0.04}
+
1%|█▍ | 31/2499 [04:40<4:17:31, 6.26s/it]
1%|█▍ | 32/2499 [04:47<4:17:50, 6.27s/it]
{'loss': 0.8634, 'grad_norm': 0.18904076516628265, 'learning_rate': 0.000199964873642341, 'ppl': 2.3712, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4506.81, 'total_tokens': 2196748, 'epoch': 0.04}
+
1%|█▍ | 32/2499 [04:47<4:17:50, 6.27s/it]
1%|█▌ | 33/2499 [04:53<4:18:20, 6.29s/it]
{'loss': 0.7711, 'grad_norm': 0.16406087577342987, 'learning_rate': 0.0001999614488434205, 'ppl': 2.1621, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4605.35, 'total_tokens': 2225839, 'epoch': 0.04}
+
1%|█▌ | 33/2499 [04:53<4:18:20, 6.29s/it]
1%|█▌ | 34/2499 [04:59<4:18:37, 6.30s/it]
{'loss': 0.7774, 'grad_norm': 0.15022194385528564, 'learning_rate': 0.00019995786479340156, 'ppl': 2.1758, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4747.45, 'total_tokens': 2255812, 'epoch': 0.04}
+
1%|█▌ | 34/2499 [04:59<4:18:37, 6.30s/it]
1%|█▌ | 35/2499 [05:05<4:18:16, 6.29s/it]
{'loss': 0.6847, 'grad_norm': 0.11543940007686615, 'learning_rate': 0.00019995412149799395, 'ppl': 1.9832, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4510.07, 'total_tokens': 2284100, 'epoch': 0.04}
+
1%|█▌ | 35/2499 [05:05<4:18:16, 6.29s/it]
1%|█▋ | 36/2499 [05:12<4:17:51, 6.28s/it]
{'loss': 0.7787, 'grad_norm': 0.1670907884836197, 'learning_rate': 0.00019995021896316128, 'ppl': 2.1786, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4492.67, 'total_tokens': 2312230, 'epoch': 0.04}
+
1%|█▋ | 36/2499 [05:12<4:17:51, 6.28s/it]
1%|█▋ | 37/2499 [05:18<4:17:29, 6.28s/it]
{'loss': 0.7615, 'grad_norm': 0.19045475125312805, 'learning_rate': 0.00019994615719512072, 'ppl': 2.1415, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4537.58, 'total_tokens': 2340625, 'epoch': 0.04}
+
1%|█▋ | 37/2499 [05:18<4:17:29, 6.28s/it]
2%|█▋ | 38/2499 [05:24<4:17:12, 6.27s/it]
{'loss': 0.841, 'grad_norm': 0.13640637695789337, 'learning_rate': 0.00019994193620034314, 'ppl': 2.3187, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4534.95, 'total_tokens': 2369006, 'epoch': 0.05}
+
2%|█▋ | 38/2499 [05:24<4:17:12, 6.27s/it]
2%|█▊ | 39/2499 [05:31<4:17:41, 6.28s/it]
{'loss': 0.8279, 'grad_norm': 0.15840484201908112, 'learning_rate': 0.00019993755598555322, 'ppl': 2.2885, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4723.1, 'total_tokens': 2398832, 'epoch': 0.05}
+
2%|█▊ | 39/2499 [05:31<4:17:41, 6.28s/it]
2%|█▊ | 40/2499 [05:37<4:17:47, 6.29s/it]
{'loss': 0.6928, 'grad_norm': 0.13987034559249878, 'learning_rate': 0.0001999330165577291, 'ppl': 1.9993, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4510.37, 'total_tokens': 2427243, 'epoch': 0.05}
+
2%|█▊ | 40/2499 [05:37<4:17:47, 6.29s/it]
2%|█▊ | 41/2499 [05:43<4:17:35, 6.29s/it]
{'loss': 0.7248, 'grad_norm': 0.13921092450618744, 'learning_rate': 0.00019992831792410272, 'ppl': 2.0643, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4685.94, 'total_tokens': 2456661, 'epoch': 0.05}
+
2%|█▊ | 41/2499 [05:43<4:17:35, 6.29s/it]
2%|█▉ | 42/2499 [05:49<4:17:18, 6.28s/it]
{'loss': 0.7406, 'grad_norm': 0.12492494285106659, 'learning_rate': 0.0001999234600921595, 'ppl': 2.0972, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4612.0, 'total_tokens': 2485581, 'epoch': 0.05}
+
2%|█▉ | 42/2499 [05:49<4:17:18, 6.28s/it]
2%|█▉ | 43/2499 [05:56<4:17:02, 6.28s/it]
{'loss': 0.7535, 'grad_norm': 0.12467890232801437, 'learning_rate': 0.00019991844306963872, 'ppl': 2.1244, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4568.82, 'total_tokens': 2514221, 'epoch': 0.05}
+
2%|█▉ | 43/2499 [05:56<4:17:02, 6.28s/it]
2%|██ | 44/2499 [06:02<4:16:41, 6.27s/it]
{'loss': 0.7356, 'grad_norm': 0.1306881159543991, 'learning_rate': 0.000199913266864533, 'ppl': 2.0867, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4386.29, 'total_tokens': 2541665, 'epoch': 0.05}
+
2%|██ | 44/2499 [06:02<4:16:41, 6.27s/it]
2%|██ | 45/2499 [06:08<4:16:23, 6.27s/it]
{'loss': 0.7163, 'grad_norm': 0.1349906027317047, 'learning_rate': 0.0001999079314850887, 'ppl': 2.0468, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4404.7, 'total_tokens': 2569218, 'epoch': 0.05}
+
2%|██ | 45/2499 [06:08<4:16:23, 6.27s/it]
2%|██ | 46/2499 [06:15<4:16:55, 6.28s/it]
{'loss': 0.6931, 'grad_norm': 0.14203360676765442, 'learning_rate': 0.0001999024369398058, 'ppl': 1.9999, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4625.96, 'total_tokens': 2598443, 'epoch': 0.06}
+
2%|██ | 46/2499 [06:15<4:16:55, 6.28s/it]
2%|██▏ | 47/2499 [06:21<4:17:25, 6.30s/it]
{'loss': 0.7034, 'grad_norm': 0.1235819086432457, 'learning_rate': 0.00019989678323743774, 'ppl': 2.0206, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4575.29, 'total_tokens': 2627402, 'epoch': 0.06}
+
2%|██▏ | 47/2499 [06:21<4:17:25, 6.30s/it]
2%|██▏ | 48/2499 [06:27<4:17:16, 6.30s/it]
{'loss': 0.7176, 'grad_norm': 0.14084498584270477, 'learning_rate': 0.00019989097038699164, 'ppl': 2.0495, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4732.23, 'total_tokens': 2657177, 'epoch': 0.06}
+
2%|██▏ | 48/2499 [06:27<4:17:16, 6.30s/it]
2%|██▏ | 49/2499 [06:33<4:17:13, 6.30s/it]
{'loss': 0.7038, 'grad_norm': 0.12469019740819931, 'learning_rate': 0.00019988499839772804, 'ppl': 2.0214, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4607.81, 'total_tokens': 2686207, 'epoch': 0.06}
+
2%|██▏ | 49/2499 [06:33<4:17:13, 6.30s/it]
2%|██▎ | 50/2499 [06:40<4:16:45, 6.29s/it]
{'loss': 0.6652, 'grad_norm': 0.12172164767980576, 'learning_rate': 0.0001998788672791611, 'ppl': 1.9449, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4587.16, 'total_tokens': 2714952, 'epoch': 0.06}
+
2%|██▎ | 50/2499 [06:40<4:16:45, 6.29s/it]
2%|██▎ | 51/2499 [06:46<4:16:32, 6.29s/it]
{'loss': 0.7439, 'grad_norm': 0.12937241792678833, 'learning_rate': 0.00019987257704105844, 'ppl': 2.1041, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4660.7, 'total_tokens': 2744218, 'epoch': 0.06}
+
2%|██▎ | 51/2499 [06:46<4:16:32, 6.29s/it]
2%|██▎ | 52/2499 [06:52<4:16:41, 6.29s/it]
{'loss': 0.7108, 'grad_norm': 0.1375284045934677, 'learning_rate': 0.0001998661276934412, 'ppl': 2.0356, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4450.95, 'total_tokens': 2772286, 'epoch': 0.06}
+
2%|██▎ | 52/2499 [06:52<4:16:41, 6.29s/it]
2%|██▍ | 53/2499 [06:59<4:16:42, 6.30s/it]
{'loss': 0.7404, 'grad_norm': 0.12681901454925537, 'learning_rate': 0.000199859519246584, 'ppl': 2.0968, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4493.7, 'total_tokens': 2800601, 'epoch': 0.06}
+
2%|██▍ | 53/2499 [06:59<4:16:42, 6.30s/it]
2%|██▍ | 54/2499 [07:05<4:16:46, 6.30s/it]
{'loss': 0.7666, 'grad_norm': 0.1492014229297638, 'learning_rate': 0.00019985275171101495, 'ppl': 2.1524, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4497.77, 'total_tokens': 2828969, 'epoch': 0.06}
+
2%|██▍ | 54/2499 [07:05<4:16:46, 6.30s/it]
2%|██▌ | 55/2499 [07:11<4:16:50, 6.31s/it]
{'loss': 0.723, 'grad_norm': 0.12260715663433075, 'learning_rate': 0.00019984582509751552, 'ppl': 2.0606, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4470.31, 'total_tokens': 2857188, 'epoch': 0.07}
+
2%|██▌ | 55/2499 [07:11<4:16:50, 6.31s/it]
2%|██▌ | 56/2499 [07:18<4:16:52, 6.31s/it]
{'loss': 0.7153, 'grad_norm': 0.1511772871017456, 'learning_rate': 0.00019983873941712072, 'ppl': 2.0448, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4408.94, 'total_tokens': 2885021, 'epoch': 0.07}
+
2%|██▌ | 56/2499 [07:18<4:16:52, 6.31s/it]
2%|██▌ | 57/2499 [07:24<4:16:36, 6.31s/it]
{'loss': 0.6686, 'grad_norm': 0.12503519654273987, 'learning_rate': 0.00019983149468111894, 'ppl': 1.9515, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4573.97, 'total_tokens': 2913799, 'epoch': 0.07}
+
2%|██▌ | 57/2499 [07:24<4:16:36, 6.31s/it]
2%|██▋ | 58/2499 [07:30<4:16:27, 6.30s/it]
{'loss': 0.7522, 'grad_norm': 0.12792782485485077, 'learning_rate': 0.0001998240909010519, 'ppl': 2.1217, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4703.97, 'total_tokens': 2943426, 'epoch': 0.07}
+
2%|██▋ | 58/2499 [07:30<4:16:27, 6.30s/it]
2%|██▋ | 59/2499 [07:36<4:16:22, 6.30s/it]
{'loss': 0.6882, 'grad_norm': 0.12937703728675842, 'learning_rate': 0.00019981652808871475, 'ppl': 1.9901, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4699.04, 'total_tokens': 2973043, 'epoch': 0.07}
+
2%|██▋ | 59/2499 [07:36<4:16:22, 6.30s/it]
2%|██▋ | 60/2499 [07:43<4:16:07, 6.30s/it]
{'loss': 0.6602, 'grad_norm': 0.12878933548927307, 'learning_rate': 0.00019980880625615604, 'ppl': 1.9352, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4500.91, 'total_tokens': 3001352, 'epoch': 0.07}
+
2%|██▋ | 60/2499 [07:43<4:16:07, 6.30s/it]
2%|██▊ | 61/2499 [07:49<4:15:36, 6.29s/it]
{'loss': 0.6685, 'grad_norm': 0.13316965103149414, 'learning_rate': 0.00019980092541567763, 'ppl': 1.9513, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4517.16, 'total_tokens': 3029652, 'epoch': 0.07}
+
2%|██▊ | 61/2499 [07:49<4:15:36, 6.29s/it]
2%|██▊ | 62/2499 [07:55<4:15:45, 6.30s/it]
{'loss': 0.7736, 'grad_norm': 0.12971599400043488, 'learning_rate': 0.0001997928855798346, 'ppl': 2.1676, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4444.53, 'total_tokens': 3057692, 'epoch': 0.07}
+
2%|██▊ | 62/2499 [07:55<4:15:45, 6.30s/it]
3%|██▊ | 63/2499 [08:02<4:15:43, 6.30s/it]
{'loss': 0.6215, 'grad_norm': 0.11753156036138535, 'learning_rate': 0.0001997846867614355, 'ppl': 1.8617, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4651.6, 'total_tokens': 3086990, 'epoch': 0.08}
+
3%|██▊ | 63/2499 [08:02<4:15:43, 6.30s/it]
3%|██▉ | 64/2499 [08:08<4:15:42, 6.30s/it]
{'loss': 0.6703, 'grad_norm': 0.14658862352371216, 'learning_rate': 0.00019977632897354202, 'ppl': 1.9548, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4746.31, 'total_tokens': 3116909, 'epoch': 0.08}
+
3%|██▉ | 64/2499 [08:08<4:15:42, 6.30s/it]
3%|██▉ | 65/2499 [08:14<4:15:28, 6.30s/it]
{'loss': 0.6798, 'grad_norm': 0.12969624996185303, 'learning_rate': 0.00019976781222946918, 'ppl': 1.9735, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4652.71, 'total_tokens': 3146161, 'epoch': 0.08}
+
3%|██▉ | 65/2499 [08:14<4:15:28, 6.30s/it]
3%|███ | 66/2499 [08:21<4:15:23, 6.30s/it]
{'loss': 0.6765, 'grad_norm': 0.20642466843128204, 'learning_rate': 0.00019975913654278525, 'ppl': 1.967, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4720.42, 'total_tokens': 3175889, 'epoch': 0.08}
+
3%|███ | 66/2499 [08:21<4:15:23, 6.30s/it]
3%|███ | 67/2499 [08:27<4:14:54, 6.29s/it]
{'loss': 0.6657, 'grad_norm': 0.12067057937383652, 'learning_rate': 0.0001997503019273116, 'ppl': 1.9459, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4506.21, 'total_tokens': 3204118, 'epoch': 0.08}
+
3%|███ | 67/2499 [08:27<4:14:54, 6.29s/it]
3%|███ | 68/2499 [08:33<4:14:44, 6.29s/it]
{'loss': 0.6175, 'grad_norm': 0.12278411537408829, 'learning_rate': 0.000199741308397123, 'ppl': 1.8543, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4567.1, 'total_tokens': 3232803, 'epoch': 0.08}
+
3%|███ | 68/2499 [08:33<4:14:44, 6.29s/it]
3%|███▏ | 69/2499 [08:39<4:15:04, 6.30s/it]
{'loss': 0.6619, 'grad_norm': 0.13150422275066376, 'learning_rate': 0.00019973215596654715, 'ppl': 1.9385, 'memory/max_active (GiB)': 17.82, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4495.32, 'total_tokens': 3261216, 'epoch': 0.08}
+
3%|███▏ | 69/2499 [08:39<4:15:04, 6.30s/it]
3%|███▏ | 70/2499 [08:46<4:14:55, 6.30s/it]
{'loss': 0.7218, 'grad_norm': 0.1392705738544464, 'learning_rate': 0.0001997228446501651, 'ppl': 2.0581, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4586.02, 'total_tokens': 3290070, 'epoch': 0.08}
+
3%|███▏ | 70/2499 [08:46<4:14:55, 6.30s/it]
3%|███▏ | 71/2499 [08:52<4:14:32, 6.29s/it]
{'loss': 0.7086, 'grad_norm': 0.15434479713439941, 'learning_rate': 0.00019971337446281087, 'ppl': 2.0311, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4580.13, 'total_tokens': 3318793, 'epoch': 0.09}
+
3%|███▏ | 71/2499 [08:52<4:14:32, 6.29s/it]
3%|███▎ | 72/2499 [08:58<4:14:10, 6.28s/it]
{'loss': 0.7222, 'grad_norm': 0.1450231820344925, 'learning_rate': 0.00019970374541957174, 'ppl': 2.059, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4561.96, 'total_tokens': 3347382, 'epoch': 0.09}
+
3%|███▎ | 72/2499 [08:58<4:14:10, 6.28s/it]
3%|███▎ | 73/2499 [09:05<4:14:10, 6.29s/it]
{'loss': 0.6646, 'grad_norm': 0.14817385375499725, 'learning_rate': 0.00019969395753578794, 'ppl': 1.9437, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4675.51, 'total_tokens': 3376788, 'epoch': 0.09}
+
3%|███▎ | 73/2499 [09:05<4:14:10, 6.29s/it]
3%|███▍ | 74/2499 [09:11<4:13:54, 6.28s/it]
{'loss': 0.6898, 'grad_norm': 0.131875678896904, 'learning_rate': 0.00019968401082705276, 'ppl': 1.9933, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4524.14, 'total_tokens': 3405160, 'epoch': 0.09}
+
3%|███▍ | 74/2499 [09:11<4:13:54, 6.28s/it]
3%|███▍ | 75/2499 [09:17<4:14:00, 6.29s/it]
{'loss': 0.6869, 'grad_norm': 0.1403125524520874, 'learning_rate': 0.0001996739053092126, 'ppl': 1.9875, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4577.87, 'total_tokens': 3433985, 'epoch': 0.09}
+
3%|███▍ | 75/2499 [09:17<4:14:00, 6.29s/it]
3%|███▍ | 76/2499 [09:23<4:14:26, 6.30s/it]
{'loss': 0.6758, 'grad_norm': 0.137966588139534, 'learning_rate': 0.00019966364099836681, 'ppl': 1.9656, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4609.11, 'total_tokens': 3463148, 'epoch': 0.09}
+
3%|███▍ | 76/2499 [09:23<4:14:26, 6.30s/it]
3%|███▌ | 77/2499 [09:30<4:14:24, 6.30s/it]
{'loss': 0.6669, 'grad_norm': 0.13154162466526031, 'learning_rate': 0.00019965321791086768, 'ppl': 1.9482, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4764.08, 'total_tokens': 3493170, 'epoch': 0.09}
+
3%|███▌ | 77/2499 [09:30<4:14:24, 6.30s/it]
3%|███▌ | 78/2499 [09:36<4:13:47, 6.29s/it]
{'loss': 0.6681, 'grad_norm': 0.1396287977695465, 'learning_rate': 0.00019964263606332051, 'ppl': 1.9505, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4335.49, 'total_tokens': 3520301, 'epoch': 0.09}
+
3%|███▌ | 78/2499 [09:36<4:13:47, 6.29s/it]
3%|███▌ | 79/2499 [09:42<4:13:27, 6.28s/it]
{'loss': 0.6799, 'grad_norm': 0.1356486976146698, 'learning_rate': 0.00019963189547258356, 'ppl': 1.9737, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4451.59, 'total_tokens': 3548202, 'epoch': 0.09}
+
3%|███▌ | 79/2499 [09:42<4:13:27, 6.28s/it]
3%|███▋ | 80/2499 [09:49<4:13:19, 6.28s/it]
{'loss': 0.6697, 'grad_norm': 0.14252781867980957, 'learning_rate': 0.0001996209961557679, 'ppl': 1.9537, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4500.44, 'total_tokens': 3576462, 'epoch': 0.1}
+
3%|███▋ | 80/2499 [09:49<4:13:19, 6.28s/it]
3%|███▋ | 81/2499 [09:55<4:13:09, 6.28s/it]
{'loss': 0.7155, 'grad_norm': 0.14615966379642487, 'learning_rate': 0.00019960993813023745, 'ppl': 2.0452, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4544.14, 'total_tokens': 3604983, 'epoch': 0.1}
+
3%|███▋ | 81/2499 [09:55<4:13:09, 6.28s/it]
3%|███▋ | 82/2499 [10:01<4:13:23, 6.29s/it]
{'loss': 0.6172, 'grad_norm': 0.13786305487155914, 'learning_rate': 0.0001995987214136091, 'ppl': 1.8537, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4671.06, 'total_tokens': 3634442, 'epoch': 0.1}
+
3%|███▋ | 82/2499 [10:01<4:13:23, 6.29s/it]
3%|███▊ | 83/2499 [10:07<4:13:31, 6.30s/it]
{'loss': 0.6399, 'grad_norm': 0.14883151650428772, 'learning_rate': 0.00019958734602375247, 'ppl': 1.8963, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4347.61, 'total_tokens': 3661862, 'epoch': 0.1}
+
3%|███▊ | 83/2499 [10:07<4:13:31, 6.30s/it]
3%|███▊ | 84/2499 [10:14<4:13:26, 6.30s/it]
{'loss': 0.6619, 'grad_norm': 0.1344694346189499, 'learning_rate': 0.00019957581197878996, 'ppl': 1.9385, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4744.18, 'total_tokens': 3691718, 'epoch': 0.1}
+
3%|███▊ | 84/2499 [10:14<4:13:26, 6.30s/it]
3%|███▉ | 85/2499 [10:20<4:13:08, 6.29s/it]
{'loss': 0.7284, 'grad_norm': 0.12591156363487244, 'learning_rate': 0.00019956411929709678, 'ppl': 2.0718, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4638.44, 'total_tokens': 3720842, 'epoch': 0.1}
+
3%|███▉ | 85/2499 [10:20<4:13:08, 6.29s/it]
3%|███▉ | 86/2499 [10:26<4:12:45, 6.29s/it]
{'loss': 0.6502, 'grad_norm': 0.1308436542749405, 'learning_rate': 0.00019955226799730081, 'ppl': 1.9159, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4529.63, 'total_tokens': 3749228, 'epoch': 0.1}
+
3%|███▉ | 86/2499 [10:26<4:12:45, 6.29s/it]
3%|███▉ | 87/2499 [10:33<4:12:38, 6.28s/it]
{'loss': 0.6609, 'grad_norm': 0.13323400914669037, 'learning_rate': 0.00019954025809828266, 'ppl': 1.9365, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4566.95, 'total_tokens': 3777912, 'epoch': 0.1}
+
3%|███▉ | 87/2499 [10:33<4:12:38, 6.28s/it]
4%|████ | 88/2499 [10:39<4:12:35, 6.29s/it]
{'loss': 0.6446, 'grad_norm': 0.16586028039455414, 'learning_rate': 0.00019952808961917558, 'ppl': 1.9052, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4553.1, 'total_tokens': 3806539, 'epoch': 0.11}
+
4%|████ | 88/2499 [10:39<4:12:35, 6.29s/it]
4%|████ | 89/2499 [10:45<4:12:53, 6.30s/it]
{'loss': 0.6663, 'grad_norm': 0.14273381233215332, 'learning_rate': 0.0001995157625793655, 'ppl': 1.947, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4543.06, 'total_tokens': 3835239, 'epoch': 0.11}
+
4%|████ | 89/2499 [10:45<4:12:53, 6.30s/it]
4%|████ | 90/2499 [10:51<4:12:49, 6.30s/it]
{'loss': 0.6725, 'grad_norm': 0.15345992147922516, 'learning_rate': 0.00019950327699849098, 'ppl': 1.9591, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4596.19, 'total_tokens': 3864175, 'epoch': 0.11}
+
4%|████ | 90/2499 [10:52<4:12:49, 6.30s/it]
4%|████▏ | 91/2499 [10:58<4:12:38, 6.29s/it]
{'loss': 0.7163, 'grad_norm': 0.16092751920223236, 'learning_rate': 0.00019949063289644302, 'ppl': 2.0468, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4597.41, 'total_tokens': 3893079, 'epoch': 0.11}
+
4%|████▏ | 91/2499 [10:58<4:12:38, 6.29s/it]
4%|████▏ | 92/2499 [11:04<4:12:21, 6.29s/it]
{'loss': 0.6764, 'grad_norm': 0.13062061369419098, 'learning_rate': 0.00019947783029336533, 'ppl': 1.9668, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4599.4, 'total_tokens': 3921954, 'epoch': 0.11}
+
4%|████▏ | 92/2499 [11:04<4:12:21, 6.29s/it]
4%|████▏ | 93/2499 [11:10<4:12:13, 6.29s/it]
{'loss': 0.6585, 'grad_norm': 0.14627501368522644, 'learning_rate': 0.00019946486920965404, 'ppl': 1.9319, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4612.44, 'total_tokens': 3950949, 'epoch': 0.11}
+
4%|████▏ | 93/2499 [11:10<4:12:13, 6.29s/it]
4%|████▎ | 94/2499 [11:17<4:12:09, 6.29s/it]
{'loss': 0.6825, 'grad_norm': 0.14802932739257812, 'learning_rate': 0.00019945174966595777, 'ppl': 1.9788, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4644.39, 'total_tokens': 3980160, 'epoch': 0.11}
+
4%|████▎ | 94/2499 [11:17<4:12:09, 6.29s/it]
4%|████▎ | 95/2499 [11:23<4:11:43, 6.28s/it]
{'loss': 0.6535, 'grad_norm': 0.151302307844162, 'learning_rate': 0.0001994384716831776, 'ppl': 1.9223, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4503.85, 'total_tokens': 4008363, 'epoch': 0.11}
+
4%|████▎ | 95/2499 [11:23<4:11:43, 6.28s/it]
4%|████▍ | 96/2499 [11:29<4:12:15, 6.30s/it]
{'loss': 0.6404, 'grad_norm': 0.15178830921649933, 'learning_rate': 0.000199425035282467, 'ppl': 1.8972, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4664.08, 'total_tokens': 4037899, 'epoch': 0.12}
+
4%|████▍ | 96/2499 [11:29<4:12:15, 6.30s/it]
4%|████▍ | 97/2499 [11:36<4:12:23, 6.30s/it]
{'loss': 0.7097, 'grad_norm': 0.1457069218158722, 'learning_rate': 0.0001994114404852319, 'ppl': 2.0334, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4667.4, 'total_tokens': 4067373, 'epoch': 0.12}
+
4%|████▍ | 97/2499 [11:36<4:12:23, 6.30s/it]
4%|████▍ | 98/2499 [11:42<4:11:56, 6.30s/it]
{'loss': 0.6538, 'grad_norm': 0.13825637102127075, 'learning_rate': 0.00019939768731313046, 'ppl': 1.9228, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4609.94, 'total_tokens': 4096295, 'epoch': 0.12}
+
4%|████▍ | 98/2499 [11:42<4:11:56, 6.30s/it]
4%|████▌ | 99/2499 [11:48<4:11:49, 6.30s/it]
{'loss': 0.6082, 'grad_norm': 0.14136871695518494, 'learning_rate': 0.00019938377578807318, 'ppl': 1.8371, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4684.31, 'total_tokens': 4125771, 'epoch': 0.12}
+
4%|████▌ | 99/2499 [11:48<4:11:49, 6.30s/it]
4%|████▌ | 100/2499 [11:54<4:11:44, 6.30s/it]
{'loss': 0.6605, 'grad_norm': 0.1564965695142746, 'learning_rate': 0.0001993697059322229, 'ppl': 1.9358, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4555.83, 'total_tokens': 4154447, 'epoch': 0.12}
+
4%|████▌ | 100/2499 [11:54<4:11:44, 6.30s/it][2025-12-28 11:17:31,070] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:17:32,807] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8711647987365723
+[2025-12-28 11:17:33,641] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8331155776977539
+[2025-12-28 11:17:34,487] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8463048934936523
+[2025-12-28 11:17:35,331] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8430600166320801
+[2025-12-28 11:17:35,331] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+
0%| | 0/90 [00:00, ?it/s][A
+
2%|██▋ | 2/90 [00:00<00:36, 2.42it/s][A
+
3%|███▉ | 3/90 [00:01<00:55, 1.57it/s][A
+
4%|█████▎ | 4/90 [00:02<01:00, 1.42it/s][A
+
6%|██████▌ | 5/90 [00:03<01:06, 1.28it/s][A
+
7%|███████▉ | 6/90 [00:04<01:06, 1.26it/s][A
+
8%|█████████▎ | 7/90 [00:05<01:10, 1.17it/s][A
+
9%|██████████▌ | 8/90 [00:06<01:09, 1.19it/s][A
+
10%|███████████▉ | 9/90 [00:07<01:10, 1.15it/s][A
+
11%|█████████████ | 10/90 [00:07<01:08, 1.18it/s][A
+
12%|██████████████▍ | 11/90 [00:08<01:09, 1.14it/s][A
+
13%|███████████████▋ | 12/90 [00:09<01:06, 1.17it/s][A
+
14%|█��███████████████ | 13/90 [00:10<01:07, 1.13it/s][A
+
16%|██████████████████▎ | 14/90 [00:11<01:05, 1.16it/s][A
+
17%|███████████████████▋ | 15/90 [00:12<01:05, 1.14it/s][A
+
18%|████████████████████▉ | 16/90 [00:13<01:03, 1.16it/s][A
+
19%|██████████████████████▎ | 17/90 [00:14<01:04, 1.13it/s][A
+
20%|███████████████████████▌ | 18/90 [00:14<01:01, 1.17it/s][A
+
21%|████████████████████████▉ | 19/90 [00:15<01:01, 1.15it/s][A
+
22%|██████████████████████████▏ | 20/90 [00:16<00:59, 1.17it/s][A
+
23%|███████████████████████████▌ | 21/90 [00:17<01:00, 1.14it/s][A
+
24%|████████████████████████████▊ | 22/90 [00:18<00:58, 1.17it/s][A
+
26%|██████████████████████████████▏ | 23/90 [00:19<00:58, 1.15it/s][A
+
27%|███████████████████████████████▍ | 24/90 [00:20<00:56, 1.17it/s][A
+
28%|████████████████████████████████▊ | 25/90 [00:20<00:56, 1.15it/s][A
+
29%|██████████████████████████████████ | 26/90 [00:21<00:54, 1.17it/s][A
+
30%|███████████████████████████████████▍ | 27/90 [00:22<00:54, 1.15it/s][A
+
31%|████████████████████████████████████▋ | 28/90 [00:23<00:52, 1.17it/s][A
+
32%|██████████████████████████████████████ | 29/90 [00:24<00:53, 1.15it/s][A
+
33%|███████████████████████████████████████▎ | 30/90 [00:25<00:51, 1.17it/s][A
+
34%|████████████████████████████████████████▋ | 31/90 [00:26<00:51, 1.15it/s][A
+
36%|█████████████████████████████████████████▉ | 32/90 [00:26<00:49, 1.17it/s][A
+
37%|███████████████████████████████████████████▎ | 33/90 [00:27<00:49, 1.15it/s][A
+
38%|████████████████████████████████████████████▌ | 34/90 [00:28<00:47, 1.17it/s][A
+
39%|█████████████████████████████████████████████▉ | 35/90 [00:29<00:47, 1.15it/s][A
+
40%|███████████████████████████████████████████████▏ | 36/90 [00:30<00:45, 1.17it/s][A
+
41%|████████████████████████████████████████████████▌ | 37/90 [00:31<00:45, 1.15it/s][A
+
42%|█████████████████████████████████████████████████▊ | 38/90 [00:32<00:44, 1.17it/s][A
+
43%|███████████████████████████████████████████████████▏ | 39/90 [00:32<00:44, 1.15it/s][A
+
44%|████████████████████████████████████████████████████▍ | 40/90 [00:33<00:42, 1.17it/s][A
+
46%|█████████████████████████████████████████████████████▊ | 41/90 [00:34<00:42, 1.15it/s][A
+
47%|███████████████████████████████████████████████████████ | 42/90 [00:35<00:40, 1.17it/s][A
+
48%|████████████████████████████████████████████████████████▍ | 43/90 [00:36<00:40, 1.15it/s][A
+
49%|█████████████████████████████████████████████████████████▋ | 44/90 [00:37<00:39, 1.17it/s][A
+
50%|███████████████████████████████████████████████████████████ | 45/90 [00:38<00:38, 1.15it/s][A
+
51%|████████████████████████████████████████████████████████████▎ | 46/90 [00:38<00:37, 1.17it/s][A
+
52%|█████████████████████████████████████████████████████████████▌ | 47/90 [00:39<00:37, 1.16it/s][A
+
53%|██████████████████████████████████████████████████████████████▉ | 48/90 [00:40<00:35, 1.17it/s][A
+
54%|████████████████████████████████████████████████████████████████▏ | 49/90 [00:41<00:35, 1.15it/s][A
+
56%|█████████████████████████████████████████████████████████████████▌ | 50/90 [00:42<00:34, 1.18it/s][A
+
57%|██████████████████████████████████████████████████████████████████▊ | 51/90 [00:43<00:33, 1.16it/s][A
+
58%|████████████████████████████████████████████████████████████████████▏ | 52/90 [00:44<00:32, 1.17it/s][A
+
59%|█████████████████████████████████████████████████████████████████████▍ | 53/90 [00:44<00:32, 1.15it/s][A
+
60%|█████████████████████���████████████████████████████████████████████████▊ | 54/90 [00:45<00:30, 1.17it/s][A
+
61%|████████████████████████████████████████████████████████████████████████ | 55/90 [00:46<00:30, 1.15it/s][A
+
62%|█████████████████████████████████████████████████████████████████████████▍ | 56/90 [00:47<00:28, 1.17it/s][A
+
63%|██████████████████████████████████████████████████████████████████████████▋ | 57/90 [00:48<00:28, 1.14it/s][A
+
64%|████████████████████████████████████████████████████████████████████████████ | 58/90 [00:49<00:27, 1.17it/s][A
+
66%|█████████████████████████████████████████████████████████████████████████████▎ | 59/90 [00:50<00:27, 1.13it/s][A
+
67%|██████████████████████████████████████████████████████████████████████████████▋ | 60/90 [00:51<00:25, 1.17it/s][A
+
68%|███████████████████████████████████████████████████████████████████████████████▉ | 61/90 [00:51<00:25, 1.13it/s][A
+
69%|█████████████████████████████████████████████████████████████████████████████████▎ | 62/90 [00:52<00:24, 1.16it/s][A
+
70%|██████████████████████████████████████████████████████████████████████████████████▌ | 63/90 [00:53<00:23, 1.13it/s][A
+
71%|███████████████████████████████████████████████████████████████████████████████████▉ | 64/90 [00:54<00:22, 1.16it/s][A
+
72%|█████████████████████████████████████████████████████████████████████████████████████▏ | 65/90 [00:55<00:22, 1.13it/s][A
+
73%|██████████████████████████████████████████████████████████████████████████████████████▌ | 66/90 [00:56<00:20, 1.16it/s][A
+
74%|███████████████████████████████████████████████████████████████████████████████████████▊ | 67/90 [00:57<00:20, 1.13it/s][A
+
76%|█████████████████████████████████████████████████████████████████████████████████████████▏ | 68/90 [00:57<00:18, 1.16it/s][A
+
77%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 69/90 [00:58<00:18, 1.14it/s][A
+
78%|█████���█████████████████████████████████████████████████████████████████████████████████████▊ | 70/90 [00:59<00:17, 1.16it/s][A
+
79%|█████████████████████████████████████████████████████████████████████████████████████████████ | 71/90 [01:00<00:16, 1.15it/s][A
+
80%|██████████████████████████████████████████████████████████████████████████████████████████████▍ | 72/90 [01:01<00:15, 1.17it/s][A
+
81%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 73/90 [01:03<00:19, 1.14s/it][A
+
82%|█████████████████████████████████████████████████████████████████████████████████████████████████ | 74/90 [01:04<00:16, 1.03s/it][A
+
83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎ | 75/90 [01:04<00:14, 1.01it/s][A
+
84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋ | 76/90 [01:05<00:13, 1.07it/s][A
+
86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 77/90 [01:06<00:12, 1.08it/s][A
+
87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 78/90 [01:07<00:10, 1.12it/s][A
+
88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 79/90 [01:08<00:09, 1.12it/s][A
+
89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 80/90 [01:09<00:08, 1.15it/s][A
+
90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 81/90 [01:10<00:07, 1.14it/s][A
+
91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 82/90 [01:10<00:06, 1.16it/s][A
+
92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 83/90 [01:11<00:06, 1.14it/s][A
+
93%|█████████████████████���████████████████████████████████████████████████████████████████████████████████████████▏ | 84/90 [01:12<00:05, 1.16it/s][A
+
94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 85/90 [01:13<00:04, 1.15it/s][A
+
96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 86/90 [01:14<00:03, 1.17it/s][A
+
97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 87/90 [01:15<00:02, 1.15it/s][A
+
98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 88/90 [01:16<00:01, 1.17it/s][A
+
99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00, 1.16it/s][A
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.14it/s][A
+
[A{'eval_loss': 0.6468729376792908, 'eval_runtime': 79.9715, 'eval_samples_per_second': 9.128, 'eval_steps_per_second': 2.288, 'eval_ppl': 1.9096, 'memory/max_active (GiB)': 12.83, 'memory/max_allocated (GiB)': 6.85, 'memory/device_reserved (GiB)': 20.07, 'epoch': 0.12}
+
4%|████▌ | 100/2499 [13:19<4:11:44, 6.30s/it]
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:18<00:00, 1.14it/s][A
+
[A
4%|████▌ | 101/2499 [13:25<21:01:55, 31.57s/it]
{'loss': 0.6184, 'grad_norm': 0.17828112840652466, 'learning_rate': 0.00019935547776799467, 'ppl': 1.856, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4614.55, 'total_tokens': 5453331, 'epoch': 0.12}
+
4%|████▌ | 101/2499 [13:25<21:01:55, 31.57s/it]
4%|████▌ | 102/2499 [13:31<15:58:35, 23.99s/it]
{'loss': 0.6822, 'grad_norm': 0.2011706829071045, 'learning_rate': 0.00019934109131805575, 'ppl': 1.9782, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4637.52, 'total_tokens': 5482579, 'epoch': 0.12}
+
4%|████▌ | 102/2499 [13:31<15:58:35, 23.99s/it]
4%|████▌ | 103/2499 [13:38<12:26:29, 18.69s/it]
{'loss': 0.5284, 'grad_norm': 0.13656415045261383, 'learning_rate': 0.00019932654660532548, 'ppl': 1.6962, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4598.33, 'total_tokens': 5511638, 'epoch': 0.12}
+
4%|████▌ | 103/2499 [13:38<12:26:29, 18.69s/it]
4%|████▋ | 104/2499 [13:44<9:57:26, 14.97s/it]
{'loss': 0.6585, 'grad_norm': 0.15870781242847443, 'learning_rate': 0.0001993118436529755, 'ppl': 1.9319, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4352.34, 'total_tokens': 5538918, 'epoch': 0.12}
+
4%|████▋ | 104/2499 [13:44<9:57:26, 14.97s/it]
4%|████▋ | 105/2499 [13:50<8:13:25, 12.37s/it]
{'loss': 0.6339, 'grad_norm': 0.14072741568088531, 'learning_rate': 0.00019929698248442938, 'ppl': 1.8849, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4732.02, 'total_tokens': 5568710, 'epoch': 0.13}
+
4%|████▋ | 105/2499 [13:50<8:13:25, 12.37s/it]
4%|████▊ | 106/2499 [13:56<7:00:22, 10.54s/it]
{'loss': 0.6381, 'grad_norm': 0.14659491181373596, 'learning_rate': 0.00019928196312336285, 'ppl': 1.8929, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4575.0, 'total_tokens': 5597423, 'epoch': 0.13}
+
4%|████▊ | 106/2499 [13:56<7:00:22, 10.54s/it]
4%|████▊ | 107/2499 [14:03<6:09:10, 9.26s/it]
{'loss': 0.6897, 'grad_norm': 0.1409890204668045, 'learning_rate': 0.00019926678559370364, 'ppl': 1.9931, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4498.1, 'total_tokens': 5625629, 'epoch': 0.13}
+
4%|████▊ | 107/2499 [14:03<6:09:10, 9.26s/it]
4%|████▉ | 108/2499 [14:09<5:33:17, 8.36s/it]
{'loss': 0.5941, 'grad_norm': 0.1351788341999054, 'learning_rate': 0.00019925144991963145, 'ppl': 1.8114, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4593.3, 'total_tokens': 5654426, 'epoch': 0.13}
+
4%|████▉ | 108/2499 [14:09<5:33:17, 8.36s/it]
4%|████▉ | 109/2499 [14:15<5:08:46, 7.75s/it]
{'loss': 0.6293, 'grad_norm': 0.1541460007429123, 'learning_rate': 0.00019923595612557793, 'ppl': 1.8763, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.11, 'tokens_per_second_per_gpu': 4634.55, 'total_tokens': 5683721, 'epoch': 0.13}
+
4%|████▉ | 109/2499 [14:15<5:08:46, 7.75s/it]
4%|████▉ | 110/2499 [14:22<4:51:31, 7.32s/it]
{'loss': 0.6673, 'grad_norm': 0.17826059460639954, 'learning_rate': 0.0001992203042362266, 'ppl': 1.949, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4305.69, 'total_tokens': 5710908, 'epoch': 0.13}
+
4%|████▉ | 110/2499 [14:22<4:51:31, 7.32s/it]
4%|█████ | 111/2499 [14:28<4:39:24, 7.02s/it]
{'loss': 0.7005, 'grad_norm': 0.14798669517040253, 'learning_rate': 0.00019920449427651292, 'ppl': 2.0148, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4490.72, 'total_tokens': 5739262, 'epoch': 0.13}
+
4%|█████ | 111/2499 [14:28<4:39:24, 7.02s/it]
4%|█████ | 112/2499 [14:34<4:30:29, 6.80s/it]
{'loss': 0.701, 'grad_norm': 0.14876116812229156, 'learning_rate': 0.00019918852627162412, 'ppl': 2.0158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4544.41, 'total_tokens': 5767800, 'epoch': 0.13}
+
4%|█████ | 112/2499 [14:34<4:30:29, 6.80s/it]
5%|█████ | 113/2499 [14:41<4:24:02, 6.64s/it]
{'loss': 0.6515, 'grad_norm': 0.14015726745128632, 'learning_rate': 0.00019917240024699924, 'ppl': 1.9184, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4582.81, 'total_tokens': 5796516, 'epoch': 0.14}
+
5%|█████ | 113/2499 [14:41<4:24:02, 6.64s/it]
5%|█████▏ | 114/2499 [14:47<4:19:38, 6.53s/it]
{'loss': 0.6357, 'grad_norm': 0.14569461345672607, 'learning_rate': 0.00019915611622832905, 'ppl': 1.8883, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4596.84, 'total_tokens': 5825374, 'epoch': 0.14}
+
5%|█████▏ | 114/2499 [14:47<4:19:38, 6.53s/it]
5%|█████▏ | 115/2499 [14:53<4:16:34, 6.46s/it]
{'loss': 0.6666, 'grad_norm': 0.1522768884897232, 'learning_rate': 0.00019913967424155598, 'ppl': 1.9476, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4636.01, 'total_tokens': 5854490, 'epoch': 0.14}
+
5%|█████▏ | 115/2499 [14:53<4:16:34, 6.46s/it]
5%|█████▏ | 116/2499 [14:59<4:14:25, 6.41s/it]
{'loss': 0.6558, 'grad_norm': 0.15072417259216309, 'learning_rate': 0.00019912307431287427, 'ppl': 1.9267, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4478.85, 'total_tokens': 5882638, 'epoch': 0.14}
+
5%|█████▏ | 116/2499 [14:59<4:14:25, 6.41s/it]
5%|█████▎ | 117/2499 [15:06<4:13:09, 6.38s/it]
{'loss': 0.6541, 'grad_norm': 0.140936478972435, 'learning_rate': 0.0001991063164687296, 'ppl': 1.9234, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4528.6, 'total_tokens': 5911187, 'epoch': 0.14}
+
5%|█████▎ | 117/2499 [15:06<4:13:09, 6.38s/it]
5%|█████▎ | 118/2499 [15:12<4:12:05, 6.35s/it]
{'loss': 0.6191, 'grad_norm': 0.14590787887573242, 'learning_rate': 0.00019908940073581937, 'ppl': 1.8573, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4668.98, 'total_tokens': 5940567, 'epoch': 0.14}
+
5%|█████▎ | 118/2499 [15:12<4:12:05, 6.35s/it]
5%|█████▍ | 119/2499 [15:18<4:11:21, 6.34s/it]
{'loss': 0.6365, 'grad_norm': 0.13646982610225677, 'learning_rate': 0.0001990723271410924, 'ppl': 1.8899, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4828.06, 'total_tokens': 5970969, 'epoch': 0.14}
+
5%|█████▍ | 119/2499 [15:18<4:11:21, 6.34s/it]
5%|█████▍ | 120/2499 [15:25<4:10:27, 6.32s/it]
{'loss': 0.5822, 'grad_norm': 0.1353752613067627, 'learning_rate': 0.00019905509571174914, 'ppl': 1.79, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4639.33, 'total_tokens': 6000051, 'epoch': 0.14}
+
5%|█████▍ | 120/2499 [15:25<4:10:27, 6.32s/it]
5%|█████▍ | 121/2499 [15:31<4:09:58, 6.31s/it]
{'loss': 0.6289, 'grad_norm': 0.17556677758693695, 'learning_rate': 0.00019903770647524137, 'ppl': 1.8755, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4626.75, 'total_tokens': 6029115, 'epoch': 0.15}
+
5%|█████▍ | 121/2499 [15:31<4:09:58, 6.31s/it]
5%|██��██▌ | 122/2499 [15:37<4:09:14, 6.29s/it]
{'loss': 0.6331, 'grad_norm': 0.1434057652950287, 'learning_rate': 0.0001990201594592723, 'ppl': 1.8834, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4451.39, 'total_tokens': 6056947, 'epoch': 0.15}
+
5%|█████▌ | 122/2499 [15:37<4:09:14, 6.29s/it]
5%|█████▌ | 123/2499 [15:43<4:09:19, 6.30s/it]
{'loss': 0.6185, 'grad_norm': 0.14586731791496277, 'learning_rate': 0.00019900245469179655, 'ppl': 1.8561, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.17, 'tokens_per_second_per_gpu': 4505.08, 'total_tokens': 6085351, 'epoch': 0.15}
+
5%|█████▌ | 123/2499 [15:43<4:09:19, 6.30s/it]
5%|█████▌ | 124/2499 [15:50<4:09:00, 6.29s/it]
{'loss': 0.6336, 'grad_norm': 0.15855848789215088, 'learning_rate': 0.00019898459220102002, 'ppl': 1.8844, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4453.23, 'total_tokens': 6113294, 'epoch': 0.15}
+
5%|█████▌ | 124/2499 [15:50<4:09:00, 6.29s/it]
5%|█████▋ | 125/2499 [15:56<4:08:35, 6.28s/it]
{'loss': 0.6083, 'grad_norm': 0.14481675624847412, 'learning_rate': 0.0001989665720153999, 'ppl': 1.8373, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4599.51, 'total_tokens': 6142097, 'epoch': 0.15}
+
5%|█████▋ | 125/2499 [15:56<4:08:35, 6.28s/it]
5%|█████▋ | 126/2499 [16:02<4:08:12, 6.28s/it]
{'loss': 0.6727, 'grad_norm': 0.167931467294693, 'learning_rate': 0.0001989483941636446, 'ppl': 1.9595, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4386.44, 'total_tokens': 6169540, 'epoch': 0.15}
+
5%|█████▋ | 126/2499 [16:02<4:08:12, 6.28s/it]
5%|█████▋ | 127/2499 [16:08<4:08:05, 6.28s/it]
{'loss': 0.601, 'grad_norm': 0.155978262424469, 'learning_rate': 0.00019893005867471374, 'ppl': 1.8239, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4730.58, 'total_tokens': 6199215, 'epoch': 0.15}
+
5%|█████▋ | 127/2499 [16:08<4:08:05, 6.28s/it]
5%|█████▊ | 128/2499 [16:15<4:07:48, 6.27s/it]
{'loss': 0.6443, 'grad_norm': 0.1500401645898819, 'learning_rate': 0.00019891156557781797, 'ppl': 1.9047, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4510.45, 'total_tokens': 6227443, 'epoch': 0.15}
+
5%|█████▊ | 128/2499 [16:15<4:07:48, 6.27s/it]
5%|█████▊ | 129/2499 [16:21<4:07:33, 6.27s/it]
{'loss': 0.6555, 'grad_norm': 0.15343204140663147, 'learning_rate': 0.0001988929149024192, 'ppl': 1.9261, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4447.09, 'total_tokens': 6255262, 'epoch': 0.15}
+
5%|█████▊ | 129/2499 [16:21<4:07:33, 6.27s/it]
5%|█████▉ | 130/2499 [16:27<4:07:53, 6.28s/it]
{'loss': 0.6536, 'grad_norm': 0.18412944674491882, 'learning_rate': 0.00019887410667823022, 'ppl': 1.9224, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4639.29, 'total_tokens': 6284499, 'epoch': 0.16}
+
5%|█████▉ | 130/2499 [16:27<4:07:53, 6.28s/it]
5%|█████▉ | 131/2499 [16:34<4:07:47, 6.28s/it]
{'loss': 0.6874, 'grad_norm': 6.614463806152344, 'learning_rate': 0.00019885514093521495, 'ppl': 1.9885, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4495.37, 'total_tokens': 6312705, 'epoch': 0.16}
+
5%|█████▉ | 131/2499 [16:34<4:07:47, 6.28s/it]
5%|█████▉ | 132/2499 [16:40<4:07:36, 6.28s/it]
{'loss': 0.6402, 'grad_norm': 0.1778506189584732, 'learning_rate': 0.0001988360177035881, 'ppl': 1.8969, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4504.67, 'total_tokens': 6340952, 'epoch': 0.16}
+
5%|█████▉ | 132/2499 [16:40<4:07:36, 6.28s/it]
5%|██████ | 133/2499 [16:46<4:07:13, 6.27s/it]
{'loss': 0.5849, 'grad_norm': 0.15809500217437744, 'learning_rate': 0.00019881673701381547, 'ppl': 1.7948, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4387.74, 'total_tokens': 6368377, 'epoch': 0.16}
+
5%|██████ | 133/2499 [16:46<4:07:13, 6.27s/it]
5%|██████ | 134/2499 [16:52<4:07:04, 6.27s/it]
{'loss': 0.5755, 'grad_norm': 0.16758741438388824, 'learning_rate': 0.00019879729889661353, 'ppl': 1.778, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4713.74, 'total_tokens': 6397901, 'epoch': 0.16}
+
5%|██████ | 134/2499 [16:52<4:07:04, 6.27s/it]
5%|██���███ | 135/2499 [16:59<4:06:57, 6.27s/it]
{'loss': 0.6093, 'grad_norm': 0.17591319978237152, 'learning_rate': 0.00019877770338294973, 'ppl': 1.8391, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4636.12, 'total_tokens': 6426945, 'epoch': 0.16}
+
5%|██████ | 135/2499 [16:59<4:06:57, 6.27s/it]
5%|██████▏ | 136/2499 [17:05<4:06:54, 6.27s/it]
{'loss': 0.6427, 'grad_norm': 0.18837158381938934, 'learning_rate': 0.0001987579505040421, 'ppl': 1.9016, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4433.54, 'total_tokens': 6454744, 'epoch': 0.16}
+
5%|██████▏ | 136/2499 [17:05<4:06:54, 6.27s/it]
5%|██████▏ | 137/2499 [17:11<4:07:08, 6.28s/it]
{'loss': 0.6579, 'grad_norm': 0.1512988954782486, 'learning_rate': 0.00019873804029135942, 'ppl': 1.9307, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4501.54, 'total_tokens': 6483079, 'epoch': 0.16}
+
5%|██████▏ | 137/2499 [17:11<4:07:08, 6.28s/it]
6%|██████▏ | 138/2499 [17:17<4:06:52, 6.27s/it]
{'loss': 0.6406, 'grad_norm': 0.1809886246919632, 'learning_rate': 0.00019871797277662125, 'ppl': 1.8976, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4511.87, 'total_tokens': 6511327, 'epoch': 0.17}
+
6%|██████▏ | 138/2499 [17:17<4:06:52, 6.27s/it]
6%|██████▎ | 139/2499 [17:24<4:07:01, 6.28s/it]
{'loss': 0.6779, 'grad_norm': 0.1574440598487854, 'learning_rate': 0.00019869774799179755, 'ppl': 1.9697, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4721.62, 'total_tokens': 6541034, 'epoch': 0.17}
+
6%|██████▎ | 139/2499 [17:24<4:07:01, 6.28s/it]
6%|██████▎ | 140/2499 [17:30<4:06:55, 6.28s/it]
{'loss': 0.6388, 'grad_norm': 0.16212943196296692, 'learning_rate': 0.00019867736596910902, 'ppl': 1.8942, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4727.92, 'total_tokens': 6570721, 'epoch': 0.17}
+
6%|██████▎ | 140/2499 [17:30<4:06:55, 6.28s/it]
6%|██████▍ | 141/2499 [17:36<4:06:44, 6.28s/it]
{'loss': 0.6901, 'grad_norm': 0.16586321592330933, 'learning_rate': 0.00019865682674102676, 'ppl': 1.9939, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4663.45, 'total_tokens': 6599963, 'epoch': 0.17}
+
6%|██████▍ | 141/2499 [17:36<4:06:44, 6.28s/it]
6%|██████▍ | 142/2499 [17:43<4:06:23, 6.27s/it]
{'loss': 0.6483, 'grad_norm': 0.1520916223526001, 'learning_rate': 0.00019863613034027224, 'ppl': 1.9123, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4568.36, 'total_tokens': 6628544, 'epoch': 0.17}
+
6%|██████▍ | 142/2499 [17:43<4:06:23, 6.27s/it]
6%|██████▍ | 143/2499 [17:49<4:07:00, 6.29s/it]
{'loss': 0.6739, 'grad_norm': 0.17079249024391174, 'learning_rate': 0.00019861527679981752, 'ppl': 1.9619, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4642.42, 'total_tokens': 6657935, 'epoch': 0.17}
+
6%|██████▍ | 143/2499 [17:49<4:07:00, 6.29s/it]
6%|██████▌ | 144/2499 [17:55<4:07:16, 6.30s/it]
{'loss': 0.6213, 'grad_norm': 0.14469042420387268, 'learning_rate': 0.00019859426615288488, 'ppl': 1.8613, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4454.93, 'total_tokens': 6686079, 'epoch': 0.17}
+
6%|██████▌ | 144/2499 [17:55<4:07:16, 6.30s/it]
6%|██████▌ | 145/2499 [18:02<4:06:50, 6.29s/it]
{'loss': 0.6334, 'grad_norm': 0.15830209851264954, 'learning_rate': 0.00019857309843294684, 'ppl': 1.884, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4440.19, 'total_tokens': 6713910, 'epoch': 0.17}
+
6%|██████▌ | 145/2499 [18:02<4:06:50, 6.29s/it]
6%|██████▌ | 146/2499 [18:08<4:06:32, 6.29s/it]
{'loss': 0.6419, 'grad_norm': 0.15467514097690582, 'learning_rate': 0.00019855177367372634, 'ppl': 1.9001, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4476.41, 'total_tokens': 6741989, 'epoch': 0.18}
+
6%|██████▌ | 146/2499 [18:08<4:06:32, 6.29s/it]
6%|██████▋ | 147/2499 [18:14<4:06:15, 6.28s/it]
{'loss': 0.6124, 'grad_norm': 0.14238551259040833, 'learning_rate': 0.0001985302919091963, 'ppl': 1.8449, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4580.51, 'total_tokens': 6770703, 'epoch': 0.18}
+
6%|██████▋ | 147/2499 [18:14<4:06:15, 6.28s/it]
6%|██████▋ | 148/2499 [18:20<4:06:06, 6.28s/it]
{'loss': 0.6293, 'grad_norm': 0.16102945804595947, 'learning_rate': 0.00019850865317357988, 'ppl': 1.8763, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4631.61, 'total_tokens': 6799769, 'epoch': 0.18}
+
6%|██████▋ | 148/2499 [18:20<4:06:06, 6.28s/it]
6%|██████▋ | 149/2499 [18:27<4:05:48, 6.28s/it]
{'loss': 0.6808, 'grad_norm': 0.1688845455646515, 'learning_rate': 0.00019848685750135033, 'ppl': 1.9755, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4488.35, 'total_tokens': 6827878, 'epoch': 0.18}
+
6%|██████▋ | 149/2499 [18:27<4:05:48, 6.28s/it]
6%|██████▊ | 150/2499 [18:33<4:05:57, 6.28s/it]
{'loss': 0.6459, 'grad_norm': 0.14278124272823334, 'learning_rate': 0.00019846490492723084, 'ppl': 1.9077, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4585.29, 'total_tokens': 6856742, 'epoch': 0.18}
+
6%|██████▊ | 150/2499 [18:33<4:05:57, 6.28s/it]
6%|██████▊ | 151/2499 [18:39<4:06:23, 6.30s/it]
{'loss': 0.6847, 'grad_norm': 0.1538703888654709, 'learning_rate': 0.0001984427954861946, 'ppl': 1.9832, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4651.06, 'total_tokens': 6886155, 'epoch': 0.18}
+
6%|██████▊ | 151/2499 [18:39<4:06:23, 6.30s/it]
6%|██████▊ | 152/2499 [18:46<4:06:17, 6.30s/it]
{'loss': 0.6242, 'grad_norm': 0.15251557528972626, 'learning_rate': 0.00019842052921346479, 'ppl': 1.8668, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4760.27, 'total_tokens': 6916113, 'epoch': 0.18}
+
6%|██████▊ | 152/2499 [18:46<4:06:17, 6.30s/it]
6%|██████▉ | 153/2499 [18:52<4:06:09, 6.30s/it]
{'loss': 0.6634, 'grad_norm': 0.15581682324409485, 'learning_rate': 0.00019839810614451434, 'ppl': 1.9414, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4622.18, 'total_tokens': 6945193, 'epoch': 0.18}
+
6%|██████▉ | 153/2499 [18:52<4:06:09, 6.30s/it]
6%|██████▉ | 154/2499 [18:58<4:05:45, 6.29s/it]
{'loss': 0.6208, 'grad_norm': 0.14313741028308868, 'learning_rate': 0.00019837552631506592, 'ppl': 1.8604, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4588.89, 'total_tokens': 6973954, 'epoch': 0.18}
+
6%|██████▉ | 154/2499 [18:58<4:05:45, 6.29s/it]
6%|███████ | 155/2499 [19:04<4:05:27, 6.28s/it]
{'loss': 0.652, 'grad_norm': 0.14645761251449585, 'learning_rate': 0.00019835278976109214, 'ppl': 1.9194, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4630.67, 'total_tokens': 7002988, 'epoch': 0.19}
+
6%|███████ | 155/2499 [19:04<4:05:27, 6.28s/it]
6%|███████ | 156/2499 [19:11<4:05:11, 6.28s/it]
{'loss': 0.6053, 'grad_norm': 0.1450553685426712, 'learning_rate': 0.0001983298965188151, 'ppl': 1.8318, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4535.2, 'total_tokens': 7031406, 'epoch': 0.19}
+
6%|███████ | 156/2499 [19:11<4:05:11, 6.28s/it]
6%|███████ | 157/2499 [19:17<4:05:20, 6.29s/it]
{'loss': 0.6132, 'grad_norm': 0.14832331240177155, 'learning_rate': 0.00019830684662470663, 'ppl': 1.8463, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4542.32, 'total_tokens': 7060015, 'epoch': 0.19}
+
6%|███████ | 157/2499 [19:17<4:05:20, 6.29s/it]
6%|███████▏ | 158/2499 [19:23<4:05:24, 6.29s/it]
{'loss': 0.6337, 'grad_norm': 0.15093529224395752, 'learning_rate': 0.0001982836401154881, 'ppl': 1.8846, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4563.48, 'total_tokens': 7088745, 'epoch': 0.19}
+
6%|███████▏ | 158/2499 [19:23<4:05:24, 6.29s/it]
6%|███████▏ | 159/2499 [19:30<4:05:10, 6.29s/it]
{'loss': 0.6969, 'grad_norm': 0.16975665092468262, 'learning_rate': 0.00019826027702813038, 'ppl': 2.0075, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4595.17, 'total_tokens': 7117583, 'epoch': 0.19}
+
6%|███████▏ | 159/2499 [19:30<4:05:10, 6.29s/it]
6%|███████▏ | 160/2499 [19:36<4:04:55, 6.28s/it]
{'loss': 0.6083, 'grad_norm': 0.1516297310590744, 'learning_rate': 0.00019823675739985376, 'ppl': 1.8373, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4718.55, 'total_tokens': 7147178, 'epoch': 0.19}
+
6%|███████▏ | 160/2499 [19:36<4:04:55, 6.28s/it]
6%|███████▎ | 161/2499 [19:42<4:04:46, 6.28s/it]
{'loss': 0.6185, 'grad_norm': 0.14229127764701843, 'learning_rate': 0.00019821308126812803, 'ppl': 1.8561, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4711.98, 'total_tokens': 7176755, 'epoch': 0.19}
+
6%|███████▎ | 161/2499 [19:42<4:04:46, 6.28s/it]
6%|███████▎ | 162/2499 [19:48<4:04:20, 6.27s/it]
{'loss': 0.6169, 'grad_norm': 0.17252376675605774, 'learning_rate': 0.00019818924867067214, 'ppl': 1.8532, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4502.44, 'total_tokens': 7204896, 'epoch': 0.19}
+
6%|███████▎ | 162/2499 [19:48<4:04:20, 6.27s/it]
7%|███████▎ | 163/2499 [19:55<4:04:10, 6.27s/it]
{'loss': 0.6053, 'grad_norm': 0.15479132533073425, 'learning_rate': 0.00019816525964545448, 'ppl': 1.8318, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4656.1, 'total_tokens': 7234072, 'epoch': 0.2}
+
7%|███████▎ | 163/2499 [19:55<4:04:10, 6.27s/it]
7%|███████▍ | 164/2499 [20:01<4:04:32, 6.28s/it]
{'loss': 0.6358, 'grad_norm': 0.1458706557750702, 'learning_rate': 0.0001981411142306925, 'ppl': 1.8885, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4626.2, 'total_tokens': 7263258, 'epoch': 0.2}
+
7%|███████▍ | 164/2499 [20:01<4:04:32, 6.28s/it]
7%|███████▍ | 165/2499 [20:07<4:04:24, 6.28s/it]
{'loss': 0.5665, 'grad_norm': 0.1417934000492096, 'learning_rate': 0.0001981168124648529, 'ppl': 1.7621, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4551.14, 'total_tokens': 7291824, 'epoch': 0.2}
+
7%|███████▍ | 165/2499 [20:07<4:04:24, 6.28s/it]
7%|███████▌ | 166/2499 [20:13<4:04:06, 6.28s/it]
{'loss': 0.6314, 'grad_norm': 0.1490688920021057, 'learning_rate': 0.00019809235438665143, 'ppl': 1.8802, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4564.55, 'total_tokens': 7320418, 'epoch': 0.2}
+
7%|███████▌ | 166/2499 [20:13<4:04:06, 6.28s/it]
7%|███████▌ | 167/2499 [20:20<4:03:43, 6.27s/it]
{'loss': 0.6009, 'grad_norm': 0.1549319177865982, 'learning_rate': 0.0001980677400350529, 'ppl': 1.8238, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4506.43, 'total_tokens': 7348591, 'epoch': 0.2}
+
7%|███████▌ | 167/2499 [20:20<4:03:43, 6.27s/it]
7%|███████▌ | 168/2499 [20:26<4:03:35, 6.27s/it]
{'loss': 0.582, 'grad_norm': 0.1679680198431015, 'learning_rate': 0.000198042969449271, 'ppl': 1.7896, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4706.82, 'total_tokens': 7378083, 'epoch': 0.2}
+
7%|███████▌ | 168/2499 [20:26<4:03:35, 6.27s/it]
7%|███████▋ | 169/2499 [20:32<4:03:31, 6.27s/it]
{'loss': 0.6688, 'grad_norm': 0.16335871815681458, 'learning_rate': 0.0001980180426687684, 'ppl': 1.9519, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4740.61, 'total_tokens': 7407810, 'epoch': 0.2}
+
7%|███████▋ | 169/2499 [20:32<4:03:31, 6.27s/it]
7%|███████▋ | 170/2499 [20:39<4:03:31, 6.27s/it]
{'loss': 0.5984, 'grad_norm': 0.15233907103538513, 'learning_rate': 0.00019799295973325657, 'ppl': 1.8192, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4701.67, 'total_tokens': 7437325, 'epoch': 0.2}
+
7%|███████▋ | 170/2499 [20:39<4:03:31, 6.27s/it]
7%|███████▋ | 171/2499 [20:45<4:03:37, 6.28s/it]
{'loss': 0.6533, 'grad_norm': 0.14838764071464539, 'learning_rate': 0.0001979677206826958, 'ppl': 1.9219, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4422.84, 'total_tokens': 7465136, 'epoch': 0.21}
+
7%|███████▋ | 171/2499 [20:45<4:03:37, 6.28s/it]
7%|███████▊ | 172/2499 [20:51<4:03:32, 6.28s/it]
{'loss': 0.5928, 'grad_norm': 0.1395515352487564, 'learning_rate': 0.000197942325557295, 'ppl': 1.809, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4613.25, 'total_tokens': 7494094, 'epoch': 0.21}
+
7%|███████▊ | 172/2499 [20:51<4:03:32, 6.28s/it]
7%|███████▊ | 173/2499 [20:57<4:03:08, 6.27s/it]
{'loss': 0.6164, 'grad_norm': 0.14091241359710693, 'learning_rate': 0.00019791677439751185, 'ppl': 1.8522, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4590.45, 'total_tokens': 7522794, 'epoch': 0.21}
+
7%|███████▊ | 173/2499 [20:57<4:03:08, 6.27s/it]
7%|███████▊ | 174/2499 [21:04<4:02:50, 6.27s/it]
{'loss': 0.6732, 'grad_norm': 0.16553938388824463, 'learning_rate': 0.0001978910672440525, 'ppl': 1.9605, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4550.86, 'total_tokens': 7551247, 'epoch': 0.21}
+
7%|███████▊ | 174/2499 [21:04<4:02:50, 6.27s/it]
7%|███████▉ | 175/2499 [21:10<4:02:44, 6.27s/it]
{'loss': 0.6298, 'grad_norm': 0.15987837314605713, 'learning_rate': 0.00019786520413787165, 'ppl': 1.8772, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4506.6, 'total_tokens': 7579481, 'epoch': 0.21}
+
7%|███████▉ | 175/2499 [21:10<4:02:44, 6.27s/it]
7%|███████▉ | 176/2499 [21:16<4:02:42, 6.27s/it]
{'loss': 0.6511, 'grad_norm': 0.14235079288482666, 'learning_rate': 0.00019783918512017253, 'ppl': 1.9176, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4668.32, 'total_tokens': 7608756, 'epoch': 0.21}
+
7%|███████▉ | 176/2499 [21:16<4:02:42, 6.27s/it]
7%|████████ | 177/2499 [21:22<4:03:16, 6.29s/it]
{'loss': 0.6042, 'grad_norm': 0.17243558168411255, 'learning_rate': 0.0001978130102324066, 'ppl': 1.8298, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4565.31, 'total_tokens': 7637623, 'epoch': 0.21}
+
7%|████████ | 177/2499 [21:22<4:03:16, 6.29s/it]
7%|████████ | 178/2499 [21:29<4:03:26, 6.29s/it]
{'loss': 0.637, 'grad_norm': 0.16263476014137268, 'learning_rate': 0.00019778667951627382, 'ppl': 1.8908, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4526.74, 'total_tokens': 7666166, 'epoch': 0.21}
+
7%|████████ | 178/2499 [21:29<4:03:26, 6.29s/it]
7%|████████ | 179/2499 [21:35<4:03:04, 6.29s/it]
{'loss': 0.6186, 'grad_norm': 0.15282128751277924, 'learning_rate': 0.00019776019301372225, 'ppl': 1.8563, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4439.86, 'total_tokens': 7693990, 'epoch': 0.21}
+
7%|████████ | 179/2499 [21:35<4:03:04, 6.29s/it]
7%|████████▏ | 180/2499 [21:41<4:02:51, 6.28s/it]
{'loss': 0.6161, 'grad_norm': 0.14302721619606018, 'learning_rate': 0.00019773355076694826, 'ppl': 1.8517, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4753.91, 'total_tokens': 7723820, 'epoch': 0.22}
+
7%|████████▏ | 180/2499 [21:41<4:02:51, 6.28s/it]
7%|████████▏ | 181/2499 [21:48<4:02:32, 6.28s/it]
{'loss': 0.6521, 'grad_norm': 0.1567981094121933, 'learning_rate': 0.00019770675281839624, 'ppl': 1.9196, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4552.84, 'total_tokens': 7752331, 'epoch': 0.22}
+
7%|████████▏ | 181/2499 [21:48<4:02:32, 6.28s/it]
7%|████████▏ | 182/2499 [21:54<4:02:22, 6.28s/it]
{'loss': 0.6502, 'grad_norm': 0.16891400516033173, 'learning_rate': 0.00019767979921075866, 'ppl': 1.9159, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4489.31, 'total_tokens': 7780479, 'epoch': 0.22}
+
7%|████████▏ | 182/2499 [21:54<4:02:22, 6.28s/it]
7%|████████▎ | 183/2499 [22:00<4:02:16, 6.28s/it]
{'loss': 0.62, 'grad_norm': 0.15879429876804352, 'learning_rate': 0.00019765268998697604, 'ppl': 1.8589, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4588.46, 'total_tokens': 7809267, 'epoch': 0.22}
+
7%|████████▎ | 183/2499 [22:00<4:02:16, 6.28s/it]
7%|████████▎ | 184/2499 [22:06<4:02:26, 6.28s/it]
{'loss': 0.6378, 'grad_norm': 0.1598796546459198, 'learning_rate': 0.00019762542519023674, 'ppl': 1.8923, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4576.5, 'total_tokens': 7838088, 'epoch': 0.22}
+
7%|████████▎ | 184/2499 [22:06<4:02:26, 6.28s/it]
7%|████████▎ | 185/2499 [22:13<4:02:51, 6.30s/it]
{'loss': 0.5584, 'grad_norm': 0.1714273989200592, 'learning_rate': 0.00019759800486397703, 'ppl': 1.7479, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4679.83, 'total_tokens': 7867688, 'epoch': 0.22}
+
7%|████████▎ | 185/2499 [22:13<4:02:51, 6.30s/it]
7%|████████▍ | 186/2499 [22:19<4:02:17, 6.29s/it]
{'loss': 0.6139, 'grad_norm': 0.16586022078990936, 'learning_rate': 0.00019757042905188088, 'ppl': 1.8476, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4281.2, 'total_tokens': 7894459, 'epoch': 0.22}
+
7%|████████▍ | 186/2499 [22:19<4:02:17, 6.29s/it]
7%|████████▍ | 187/2499 [22:25<4:02:07, 6.28s/it]
{'loss': 0.6282, 'grad_norm': 0.1663977950811386, 'learning_rate': 0.00019754269779788, 'ppl': 1.8742, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4602.7, 'total_tokens': 7923350, 'epoch': 0.22}
+
7%|████████▍ | 187/2499 [22:25<4:02:07, 6.28s/it]
8%|████████▌ | 188/2499 [22:32<4:01:49, 6.28s/it]
{'loss': 0.5851, 'grad_norm': 0.1668008416891098, 'learning_rate': 0.0001975148111461538, 'ppl': 1.7952, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4571.75, 'total_tokens': 7951987, 'epoch': 0.23}
+
8%|████████▌ | 188/2499 [22:32<4:01:49, 6.28s/it]
8%|████████▌ | 189/2499 [22:38<4:01:32, 6.27s/it]
{'loss': 0.626, 'grad_norm': 0.18379661440849304, 'learning_rate': 0.00019748676914112915, 'ppl': 1.8701, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4556.79, 'total_tokens': 7980520, 'epoch': 0.23}
+
8%|████████▌ | 189/2499 [22:38<4:01:32, 6.27s/it]
8%|████████▌ | 190/2499 [22:44<4:01:34, 6.28s/it]
{'loss': 0.5925, 'grad_norm': 0.13806037604808807, 'learning_rate': 0.00019745857182748054, 'ppl': 1.8085, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4700.59, 'total_tokens': 8010056, 'epoch': 0.23}
+
8%|████████▌ | 190/2499 [22:44<4:01:34, 6.28s/it]
8%|████████▋ | 191/2499 [22:50<4:01:44, 6.28s/it]
{'loss': 0.6313, 'grad_norm': 0.14297842979431152, 'learning_rate': 0.00019743021925012973, 'ppl': 1.8801, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4553.45, 'total_tokens': 8038737, 'epoch': 0.23}
+
8%|████████▋ | 191/2499 [22:50<4:01:44, 6.28s/it]
8%|████████▋ | 192/2499 [22:57<4:02:09, 6.30s/it]
{'loss': 0.6269, 'grad_norm': 0.16967882215976715, 'learning_rate': 0.000197401711454246, 'ppl': 1.8718, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4698.64, 'total_tokens': 8068455, 'epoch': 0.23}
+
8%|████████▋ | 192/2499 [22:57<4:02:09, 6.30s/it]
8%|████████▋ | 193/2499 [23:03<4:01:52, 6.29s/it]
{'loss': 0.5853, 'grad_norm': 0.15979325771331787, 'learning_rate': 0.0001973730484852458, 'ppl': 1.7955, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4662.01, 'total_tokens': 8097728, 'epoch': 0.23}
+
8%|████████▋ | 193/2499 [23:03<4:01:52, 6.29s/it]
8%|████████▊ | 194/2499 [23:09<4:01:24, 6.28s/it]
{'loss': 0.6389, 'grad_norm': 0.1816360056400299, 'learning_rate': 0.00019734423038879283, 'ppl': 1.8944, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4387.73, 'total_tokens': 8125195, 'epoch': 0.23}
+
8%|████████▊ | 194/2499 [23:09<4:01:24, 6.28s/it]
8%|████████▊ | 195/2499 [23:16<4:01:06, 6.28s/it]
{'loss': 0.5965, 'grad_norm': 0.14533467590808868, 'learning_rate': 0.00019731525721079793, 'ppl': 1.8158, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4552.24, 'total_tokens': 8153711, 'epoch': 0.23}
+
8%|████████▊ | 195/2499 [23:16<4:01:06, 6.28s/it]
8%|████████▊ | 196/2499 [23:22<4:00:54, 6.28s/it]
{'loss': 0.6688, 'grad_norm': 0.16294941306114197, 'learning_rate': 0.000197286128997419, 'ppl': 1.9519, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4513.63, 'total_tokens': 8182000, 'epoch': 0.24}
+
8%|████████▊ | 196/2499 [23:22<4:00:54, 6.28s/it]
8%|████████▉ | 197/2499 [23:28<4:00:37, 6.27s/it]
{'loss': 0.638, 'grad_norm': 0.15876515209674835, 'learning_rate': 0.00019725684579506095, 'ppl': 1.8927, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4516.46, 'total_tokens': 8210270, 'epoch': 0.24}
+
8%|████████▉ | 197/2499 [23:28<4:00:37, 6.27s/it]
8%|████████▉ | 198/2499 [23:34<4:01:01, 6.28s/it]
{'loss': 0.6244, 'grad_norm': 0.1551365852355957, 'learning_rate': 0.00019722740765037555, 'ppl': 1.8671, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4607.89, 'total_tokens': 8239361, 'epoch': 0.24}
+
8%|████████▉ | 198/2499 [23:34<4:01:01, 6.28s/it]
8%|████████▉ | 199/2499 [23:41<4:01:12, 6.29s/it]
{'loss': 0.6537, 'grad_norm': 0.15418943762779236, 'learning_rate': 0.00019719781461026146, 'ppl': 1.9226, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4639.8, 'total_tokens': 8268621, 'epoch': 0.24}
+
8%|████████▉ | 199/2499 [23:41<4:01:12, 6.29s/it]
8%|█████████ | 200/2499 [23:47<4:00:55, 6.29s/it]
{'loss': 0.6467, 'grad_norm': 0.15851524472236633, 'learning_rate': 0.00019716806672186412, 'ppl': 1.9092, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4664.04, 'total_tokens': 8297884, 'epoch': 0.24}
+
8%|█████████ | 200/2499 [23:47<4:00:55, 6.29s/it][2025-12-28 11:29:23,624] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step...
+[2025-12-28 11:29:25,368] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8519337177276611
+[2025-12-28 11:29:26,219] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8512239456176758
+[2025-12-28 11:29:27,107] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8861675262451172
+[2025-12-28 11:29:27,946] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8390281200408936
+[2025-12-28 11:29:27,947] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90]
+
+
0%| | 0/90 [00:00, ?it/s][A
+
2%|██▋ | 2/90 [00:00<00:36, 2.41it/s][A
+
3%|███▉ | 3/90 [00:01<00:54, 1.58it/s][A
+
4%|█████▎ | 4/90 [00:02<01:00, 1.43it/s][A
+
6%|██████▌ | 5/90 [00:03<01:05, 1.30it/s][A
+
7%|███████▉ | 6/90 [00:04<01:05, 1.27it/s][A
+
8%|█████████▎ | 7/90 [00:05<01:08, 1.22it/s][A
+
9%|██████████▌ | 8/90 [00:06<01:07, 1.22it/s][A
+
10%|███████████▉ | 9/90 [00:06<01:08, 1.19it/s][A
+
11%|█████████████ | 10/90 [00:07<01:06, 1.20it/s][A
+
12%|██████████████▍ | 11/90 [00:08<01:07, 1.17it/s][A
+
13%|███████████████▋ | 12/90 [00:09<01:05, 1.19it/s][A
+
14%|█████████████████ | 13/90 [00:10<01:06, 1.16it/s][A
+
16%|██████████████████▎ | 14/90 [00:11<01:04, 1.18it/s][A
+
17%|███████████████████▋ | 15/90 [00:12<01:04, 1.16it/s][A
+
18%|████████████████████▉ | 16/90 [00:12<01:02, 1.18it/s][A
+
19%|██████████████████████▎ | 17/90 [00:13<01:02, 1.16it/s][A
+
20%|███████████████████████▌ | 18/90 [00:14<01:01, 1.18it/s][A
+
21%|████████████████████████▉ | 19/90 [00:15<01:01, 1.16it/s][A
+
22%|██████████████████████████▏ | 20/90 [00:16<00:59, 1.18it/s][A
+
23%|███████████████████████████▌ | 21/90 [00:17<00:59, 1.15it/s][A
+
24%|████████████████████████████▊ | 22/90 [00:18<00:58, 1.17it/s][A
+
26%|██████████████████████████████▏ | 23/90 [00:18<00:58, 1.15it/s][A
+
27%|███████████████████████████████▍ | 24/90 [00:19<00:56, 1.17it/s][A
+
28%|████████████████████████████████▊ | 25/90 [00:20<00:56, 1.15it/s][A
+
29%|██████████████████████████████████ | 26/90 [00:21<00:54, 1.17it/s][A
+
30%|███████████████████████████████████▍ | 27/90 [00:22<00:55, 1.14it/s][A
+
31%|████████████████████████████████████▋ | 28/90 [00:23<00:53, 1.17it/s][A
+
32%|██████████████████████████████████████ | 29/90 [00:24<00:53, 1.13it/s][A
+
33%|███████████████████████████████████████▎ | 30/90 [00:24<00:51, 1.16it/s][A
+
34%|████████████████████████████████████████▋ | 31/90 [00:25<00:51, 1.14it/s][A
+
36%|█████████████████████████████████████████▉ | 32/90 [00:26<00:49, 1.16it/s][A
+
37%|███████████████████████████████████████████▎ | 33/90 [00:27<00:50, 1.13it/s][A
+
38%|████████████████████████████████████████████▌ | 34/90 [00:28<00:48, 1.16it/s][A
+
39%|█████████████████████████████████████████████▉ | 35/90 [00:29<00:48, 1.13it/s][A
+
40%|███████████████████████████████████████████████▏ | 36/90 [00:30<00:46, 1.17it/s][A
+
41%|████████████████████████████████████████████████▌ | 37/90 [00:31<00:46, 1.15it/s][A
+
42%|█████████████████████████████████████████████████▊ | 38/90 [00:31<00:44, 1.17it/s][A
+
43%|███████████████████���███████████████████████████████▏ | 39/90 [00:32<00:44, 1.15it/s][A
+
44%|████████████████████████████████████████████████████▍ | 40/90 [00:33<00:42, 1.17it/s][A
+
46%|█████████████████████████████████████████████████████▊ | 41/90 [00:34<00:42, 1.15it/s][A
+
47%|███████████████████████████████████████████████████████ | 42/90 [00:35<00:41, 1.17it/s][A
+
48%|████████████████████████████████████████████████████████▍ | 43/90 [00:36<00:40, 1.15it/s][A
+
49%|█████████████████████████████████████████████████████████▋ | 44/90 [00:37<00:39, 1.16it/s][A
+
50%|███████████████████████████████████████████████████████████ | 45/90 [00:38<00:39, 1.14it/s][A
+
51%|████████████████████████████████████████████████████████████▎ | 46/90 [00:38<00:37, 1.16it/s][A
+
52%|█████████████████████████████████████████████████████████████▌ | 47/90 [00:39<00:37, 1.15it/s][A
+
53%|██████████████████████████████████████████████████████████████▉ | 48/90 [00:40<00:35, 1.17it/s][A
+
54%|████████████████████████████████████████████████████████████████▏ | 49/90 [00:41<00:35, 1.15it/s][A
+
56%|█████████████████████████████████████████████████████████████████▌ | 50/90 [00:42<00:34, 1.17it/s][A
+
57%|██████████████████████████████████████████████████████████████████▊ | 51/90 [00:43<00:34, 1.14it/s][A
+
58%|████████████████████████████████████████████████████████████████████▏ | 52/90 [00:43<00:32, 1.17it/s][A
+
59%|█████████████████████████████████████████████████████████████████████▍ | 53/90 [00:44<00:32, 1.15it/s][A
+
60%|██████████████████████████████████████████████████████████████████████▊ | 54/90 [00:45<00:30, 1.17it/s][A
+
61%|████████████████████████████████████████████████████████████████████████ | 55/90 [00:47<00:34, 1.01it/s][A
+
62%|█████████████████████████████████████████████████████████████████████████▍ | 56/90 [00:47<00:28, 1.18it/s][A
+
63%|██████████████████████████████████████████████████████████████████████████▋ | 57/90 [00:48<00:28, 1.16it/s][A
+
64%|████████████████████████████████████████████████████████████████████████████ | 58/90 [00:49<00:27, 1.18it/s][A
+
66%|█████████████████████████████████████████████████████████████████████████████▎ | 59/90 [00:50<00:26, 1.16it/s][A
+
67%|██████████████████████████████████████████████████████████████████████████████▋ | 60/90 [00:50<00:25, 1.18it/s][A
+
68%|███████████████████████████████████████████████████████████████████████████████▉ | 61/90 [00:51<00:25, 1.15it/s][A
+
69%|█████████████████████████████████████████████████████████████████████████████████▎ | 62/90 [00:52<00:23, 1.17it/s][A
+
70%|██████████████████████████████████████████████████████████████████████████████████▌ | 63/90 [00:53<00:26, 1.02it/s][A
+
71%|███████████████████████████████████████████████████████████████████████████████████▉ | 64/90 [00:54<00:24, 1.07it/s][A
+
72%|█████████████████████████████████████████████████████████████████████████████████████▏ | 65/90 [00:55<00:23, 1.09it/s][A
+
73%|██████████████████████████████████████████████████████████████████████████████████████▌ | 66/90 [00:56<00:21, 1.12it/s][A
+
74%|███████████████████████████████████████████████████████████████████████████████████████▊ | 67/90 [00:57<00:20, 1.12it/s][A
+
76%|█████████████████████████████████████████████████████████████████████████████████████████▏ | 68/90 [00:58<00:19, 1.15it/s][A
+
77%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 69/90 [00:59<00:18, 1.14it/s][A
+
78%|███████████████████████████████████████████████████████████████████████████████████████████▊ | 70/90 [00:59<00:17, 1.16it/s][A
+
79%|█████████████████████████████████████████████████████████████████████████████████████████████ | 71/90 [01:00<00:16, 1.15it/s][A
+
80%|██████████████████████████████████████████████████████████████████████████████████████████████▍ | 72/90 [01:01<00:15, 1.17it/s][A
+
81%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 73/90 [01:02<00:14, 1.14it/s][A
+
82%|█████████████████████████████████████████████████████████████████████████████████████████████████ | 74/90 [01:03<00:13, 1.17it/s][A
+
83%|██████████████████████████████████████████████████████████████████████████████████████████████████▎ | 75/90 [01:04<00:13, 1.14it/s][A
+
84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋ | 76/90 [01:05<00:11, 1.17it/s][A
+
86%|████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 77/90 [01:06<00:11, 1.14it/s][A
+
87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 78/90 [01:06<00:10, 1.17it/s][A
+
88%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 79/90 [01:07<00:09, 1.14it/s][A
+
89%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 80/90 [01:08<00:08, 1.17it/s][A
+
90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 81/90 [01:09<00:07, 1.14it/s][A
+
91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 82/90 [01:10<00:06, 1.17it/s][A
+
92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 83/90 [01:11<00:06, 1.14it/s][A
+
93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 84/90 [01:12<00:05, 1.16it/s][A
+
94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 85/90 [01:12<00:04, 1.14it/s][A
+
96%|█████████████████████████████████████████████████████████��██████████████████████████████████████████████████████▊ | 86/90 [01:13<00:03, 1.16it/s][A
+
97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 87/90 [01:14<00:02, 1.15it/s][A
+
98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 88/90 [01:15<00:01, 1.17it/s][A
+
99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 89/90 [01:16<00:00, 1.15it/s][A
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.15it/s][A
+
[A{'eval_loss': 0.6098045110702515, 'eval_runtime': 79.6449, 'eval_samples_per_second': 9.166, 'eval_steps_per_second': 2.298, 'eval_ppl': 1.8401, 'memory/max_active (GiB)': 12.83, 'memory/max_allocated (GiB)': 6.85, 'memory/device_reserved (GiB)': 20.19, 'epoch': 0.24}
+
8%|█████████ | 200/2499 [25:11<4:00:55, 6.29s/it]
+
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.15it/s][A
+
[A[2025-12-28 11:30:51,250] [INFO] [axolotl.core.trainers.base._save:692] [PID:42410] Saving model checkpoint to ./outputs/luau-codellama-h200/checkpoint-200