[2025-12-13 20:39:02,352] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:1121389] bf16 support detected, enabling for this configuration.
[2025-12-13 20:39:02,354] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1121389] baseline 0.000GB ()
[2025-12-13 20:39:02,354] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1121389] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "train.lima.yml",
  "base_model": "/home/alex/Workspace/sllama/out_5/checkpoint-1722000",
  "base_model_config": "/home/alex/Workspace/sllama/out_5/checkpoint-1722000",
  "batch_size": 1,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_120",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "cosine_constant_lr_ratio": 0.1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 0,
  "dataloader_pin_memory": true,
  "dataset_prepared_path": "data_prep",
  "dataset_processes": 12,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "lima.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "fp8": true,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": false
  },
  "group_by_length": false,
  "include_tkps": true,
  "is_llama_derived_model": true,
  "learning_rate": 0.1,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 10,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "llama",
  "num_epochs": 4.0,
  "optim_args": {
    "momentum": 0.98
  },
  "optimizer": "sgd",
  "output_dir": "./out_6_lima",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resize_token_embeddings_to_32x": true,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": true,
  "save_safetensors": true,
  "save_steps": 100,
  "save_total_limit": 2,
  "sequence_len": 2048,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "/home/alex/Workspace/sllama/out_5/checkpoint-1722000",
  "tokenizer_save_jinja_files": true,
  "torch_compile": true,
  "torch_compile_backend": "inductor",
  "torch_compile_mode": "default",
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "unfrozen_parameters": [
    "^(?![\\s\\S]*embed_tokens)[\\s\\S]+$"
  ],
  "use_ray": false,
  "use_tensorboard": true,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_project": "sllama",
  "warmup_ratio": 0.05,
  "weight_decay": 0.01,
  "world_size": 1
}
[2025-12-13 20:39:02,909] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:1121389] EOS: 128009 / <|eot_id|>
[2025-12-13 20:39:02,909] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1121389] BOS: 128000 / <|begin_of_text|>
[2025-12-13 20:39:02,909] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1121389] PAD: 128004 / <|finetune_right_pad_id|>
[2025-12-13 20:39:02,909] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1121389] UNK: None / None
[2025-12-13 20:39:02,909] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:1121389] Loading prepared dataset from disk at data_prep/0ed3928757e6a5973cddee77ac8c62e0...
[2025-12-13 20:39:02,913] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:1121389] total_num_tokens: 640_837
[2025-12-13 20:39:02,917] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:1121389] `total_supervised_tokens: 445_575`
[2025-12-13 20:39:03,640] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.36017513275146484
[2025-12-13 20:39:03,998] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.35738229751586914
[2025-12-13 20:39:04,353] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.35471415519714355
[2025-12-13 20:39:04,713] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.3598747253417969
[2025-12-13 20:39:04,724] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:1121389] gather_len_batches: [322]
[2025-12-13 20:39:04,724] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:1121389] data_loader_len: 322
[2025-12-13 20:39:04,725] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:1121389] sample_packing_eff_est across ranks: [0.9687575585332817]
[2025-12-13 20:39:04,725] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:1121389] sample_packing_eff_est: 0.97
[2025-12-13 20:39:04,725] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:1121389] total_num_steps: 1288
[2025-12-13 20:39:04,725] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:1121389] Maximum number of steps set at 1288
[2025-12-13 20:39:04,740] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:1121389] Loading tokenizer... /home/alex/Workspace/sllama/out_5/checkpoint-1722000
[2025-12-13 20:39:04,952] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:1121389] EOS: 128009 / <|eot_id|>
[2025-12-13 20:39:04,952] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1121389] BOS: 128000 / <|begin_of_text|>
[2025-12-13 20:39:04,952] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1121389] PAD: 128004 / <|finetune_right_pad_id|>
[2025-12-13 20:39:04,952] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1121389] UNK: None / None
[2025-12-13 20:39:04,952] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:1121389] Loading model
[2025-12-13 20:39:04,955] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1121389] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-12-13 20:39:04,956] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1121389] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-12-13 20:39:04,957] [INFO] [axolotl.monkeypatch.trainer_accelerator_args.patch_create_accelerate_code_for_fp8:80] [PID:1121389] patching create_accelerator_and_postprocess to allow for overrides
[2025-12-13 20:39:04,957] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:294] [PID:1121389] Applying multipack dataloader patch for sample packing...
[2025-12-13 20:39:05,015] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:1121389] Applying Cut Cross Entropy to model type: llama
[2025-12-13 20:39:05,020] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:1121389] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
[2025-12-13 20:39:05,890] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1121389] Converting modules to torch.bfloat16
[2025-12-13 20:39:06,358] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1121389] Memory usage after model load 0.000GB ()
[2025-12-13 20:39:06,858] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.self_attn.q_proj.weight
[2025-12-13 20:39:06,858] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.self_attn.k_proj.weight
[2025-12-13 20:39:06,858] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.self_attn.v_proj.weight
[2025-12-13 20:39:06,858] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.self_attn.o_proj.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.mlp.gate_proj.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.mlp.up_proj.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.mlp.down_proj.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.input_layernorm.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.layers.0.post_attention_layernorm.weight
[2025-12-13 20:39:06,859] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:1121389] Unfrozen model.norm.weight
[2025-12-13 20:39:07,912] [INFO] [axolotl.train.save_initial_configs:412] [PID:1121389] Pre-saving tokenizer to ./out_6_lima...
[2025-12-13 20:39:07,973] [INFO] [axolotl.train.save_initial_configs:417] [PID:1121389] Pre-saving model config to ./out_6_lima...
[2025-12-13 20:39:07,975] [INFO] [axolotl.train.execute_training:203] [PID:1121389] Starting trainer...
[2025-12-13 20:39:09,007] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.37635207176208496
[2025-12-13 20:39:09,392] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.3846709728240967
[2025-12-13 20:39:09,774] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.38227200508117676
[2025-12-13 20:39:10,157] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1121389] generate_batches time: 0.3822507858276367
[2025-12-13 20:39:10,157] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:1121389] gather_len_batches: [323]
[34m[1mwandb[0m: Currently logged in as: [33malex-ht[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.21.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/alex/Workspace/sllama/wandb/run-20251213_203911-00ktlhat[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstep6_lima[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/alex-ht/sllama[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/alex-ht/sllama/runs/00ktlhat[0m
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-12-13 20:39:13,730] [INFO] [axolotl.utils.callbacks.on_train_begin:793] [PID:1121389] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                          | 0/1288 [00:00<?, ?it/s]  0%|                                                                                                                | 1/1288 [00:07<2:44:44,  7.68s/it]  0%|▏                                                                                                               | 2/1288 [00:08<1:24:08,  3.93s/it]  0%|▎                                                                                                                 | 3/1288 [00:10<58:31,  2.73s/it]  0%|▎                                                                                                                 | 4/1288 [00:11<46:16,  2.16s/it]  0%|▍                                                                                                                 | 5/1288 [00:12<39:21,  1.84s/it]  0%|▌                                                                                                                 | 6/1288 [00:14<35:42,  1.67s/it]  1%|▌                                                                                                                 | 7/1288 [00:15<35:09,  1.65s/it]  1%|▋                                                                                                                 | 8/1288 [00:17<35:34,  1.67s/it]  1%|▊                                                                                                                 | 9/1288 [00:18<27:58,  1.31s/it]  1%|▉                                                                                                                | 10/1288 [00:18<22:25,  1.05s/it]                                                                                                                                                        {'loss': 6.8542, 'grad_norm': 4.84375, 'learning_rate': 0.0140625, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2745.36, 'epoch': 0.03}
  1%|▉                                                                                                                | 10/1288 [00:18<22:25,  1.05s/it]  1%|▉                                                                                                                | 11/1288 [00:19<19:07,  1.11it/s]  1%|█                                                                                                                | 12/1288 [00:19<16:17,  1.31it/s]  1%|█▏                                                                                                               | 13/1288 [00:20<14:30,  1.46it/s]  1%|█▏                                                                                                               | 14/1288 [00:20<13:05,  1.62it/s]  1%|█▎                                                                                                               | 15/1288 [00:20<12:22,  1.72it/s]  1%|█▍                                                                                                               | 16/1288 [00:21<13:13,  1.60it/s]  1%|█▍                                                                                                               | 17/1288 [00:22<12:23,  1.71it/s]  1%|█▌                                                                                                               | 18/1288 [00:22<11:38,  1.82it/s]  1%|█▋                                                                                                               | 19/1288 [00:23<11:50,  1.79it/s]  2%|█▊                                                                                                               | 20/1288 [00:23<11:18,  1.87it/s]                                                                                                                                                        {'loss': 6.6978, 'grad_norm': 4.21875, 'learning_rate': 0.029687500000000002, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2828.51, 'epoch': 0.06}
  2%|█▊                                                                                                               | 20/1288 [00:23<11:18,  1.87it/s]  2%|█▊                                                                                                               | 21/1288 [00:24<11:19,  1.87it/s]  2%|█▉                                                                                                               | 22/1288 [00:24<11:22,  1.85it/s]  2%|██                                                                                                               | 23/1288 [00:25<11:23,  1.85it/s]  2%|██                                                                                                               | 24/1288 [00:25<11:19,  1.86it/s]  2%|██▏                                                                                                              | 25/1288 [00:26<10:50,  1.94it/s]  2%|██▎                                                                                                              | 26/1288 [00:26<10:46,  1.95it/s]  2%|██▎                                                                                                              | 27/1288 [00:27<11:03,  1.90it/s]  2%|██▍                                                                                                              | 28/1288 [00:27<10:44,  1.96it/s]  2%|██▌                                                                                                              | 29/1288 [00:28<10:54,  1.92it/s]  2%|██▋                                                                                                              | 30/1288 [00:28<11:04,  1.89it/s]                                                                                                                                                        {'loss': 6.5196, 'grad_norm': 4.75, 'learning_rate': 0.045312500000000006, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2799.53, 'epoch': 0.09}
  2%|██▋                                                                                                              | 30/1288 [00:28<11:04,  1.89it/s]  2%|██▋                                                                                                              | 31/1288 [00:29<10:59,  1.91it/s]  2%|██▊                                                                                                              | 32/1288 [00:29<10:37,  1.97it/s]  3%|██▉                                                                                                              | 33/1288 [00:30<10:45,  1.94it/s]  3%|██▉                                                                                                              | 34/1288 [00:30<10:16,  2.03it/s]  3%|███                                                                                                              | 35/1288 [00:31<10:16,  2.03it/s]  3%|███▏                                                                                                             | 36/1288 [00:31<10:09,  2.05it/s]  3%|███▏                                                                                                             | 37/1288 [00:32<10:17,  2.03it/s]  3%|███▎                                                                                                             | 38/1288 [00:32<10:41,  1.95it/s]  3%|███▍                                                                                                             | 39/1288 [00:33<10:24,  2.00it/s]  3%|███▌                                                                                                             | 40/1288 [00:33<09:20,  2.23it/s]                                                                                                                                                        {'loss': 6.6607, 'grad_norm': 9.4375, 'learning_rate': 0.060937500000000006, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3775.34, 'epoch': 0.12}
  3%|███▌                                                                                                             | 40/1288 [00:33<09:20,  2.23it/s]  3%|███▌                                                                                                             | 41/1288 [00:34<09:30,  2.19it/s]  3%|███▋                                                                                                             | 42/1288 [00:34<09:37,  2.16it/s]  3%|███▊                                                                                                             | 43/1288 [00:35<09:51,  2.10it/s]  3%|███▊                                                                                                             | 44/1288 [00:35<10:01,  2.07it/s]  3%|███▉                                                                                                             | 45/1288 [00:36<10:22,  2.00it/s]  4%|████                                                                                                             | 46/1288 [00:36<10:18,  2.01it/s]  4%|████                                                                                                             | 47/1288 [00:37<10:02,  2.06it/s]  4%|████▏                                                                                                            | 48/1288 [00:37<10:27,  1.98it/s]  4%|████▎                                                                                                            | 49/1288 [00:38<10:39,  1.94it/s]  4%|████▍                                                                                                            | 50/1288 [00:38<10:30,  1.96it/s]                                                                                                                                                        {'loss': 6.8588, 'grad_norm': 5.78125, 'learning_rate': 0.0765625, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2773.42, 'epoch': 0.15}
  4%|████▍                                                                                                            | 50/1288 [00:38<10:30,  1.96it/s]  4%|████▍                                                                                                            | 51/1288 [00:39<10:30,  1.96it/s]  4%|████▌                                                                                                            | 52/1288 [00:39<10:25,  1.98it/s]  4%|████▋                                                                                                            | 53/1288 [00:40<10:25,  1.98it/s]  4%|████▋                                                                                                            | 54/1288 [00:40<10:19,  1.99it/s]  4%|████▊                                                                                                            | 55/1288 [00:41<09:50,  2.09it/s]  4%|████▉                                                                                                            | 56/1288 [00:41<10:09,  2.02it/s]  4%|█████                                                                                                            | 57/1288 [00:42<10:23,  1.97it/s]  5%|█████                                                                                                            | 58/1288 [00:42<10:33,  1.94it/s]  5%|█████▏                                                                                                           | 59/1288 [00:43<09:59,  2.05it/s]  5%|█████▎                                                                                                           | 60/1288 [00:43<10:01,  2.04it/s]                                                                                                                                                        {'loss': 6.6242, 'grad_norm': 4.40625, 'learning_rate': 0.0921875, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2760.99, 'epoch': 0.19}
  5%|█████▎                                                                                                           | 60/1288 [00:43<10:01,  2.04it/s]  5%|█████▎                                                                                                           | 61/1288 [00:44<10:27,  1.95it/s]  5%|█████▍                                                                                                           | 62/1288 [00:44<10:27,  1.95it/s]  5%|█████▌                                                                                                           | 63/1288 [00:45<10:37,  1.92it/s]  5%|█████▌                                                                                                           | 64/1288 [00:45<10:29,  1.94it/s]  5%|█████▋                                                                                                           | 65/1288 [00:46<10:36,  1.92it/s]  5%|█████▊                                                                                                           | 66/1288 [00:46<10:40,  1.91it/s]  5%|█████▉                                                                                                           | 67/1288 [00:47<10:50,  1.88it/s]  5%|█████▉                                                                                                           | 68/1288 [00:48<10:52,  1.87it/s]  5%|██████                                                                                                           | 69/1288 [00:48<10:51,  1.87it/s]  5%|██████▏                                                                                                          | 70/1288 [00:49<10:55,  1.86it/s]                                                                                                                                                        {'loss': 6.3776, 'grad_norm': 5.25, 'learning_rate': 0.09999588271465323, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3077.19, 'epoch': 0.22}
  5%|██████▏                                                                                                          | 70/1288 [00:49<10:55,  1.86it/s]  6%|██████▏                                                                                                          | 71/1288 [00:49<10:18,  1.97it/s]  6%|██████▎                                                                                                          | 72/1288 [00:50<10:14,  1.98it/s]  6%|██████▍                                                                                                          | 73/1288 [00:50<10:21,  1.96it/s]  6%|██████▍                                                                                                          | 74/1288 [00:51<10:22,  1.95it/s]  6%|██████▌                                                                                                          | 75/1288 [00:51<10:15,  1.97it/s]  6%|██████▋                                                                                                          | 76/1288 [00:52<10:02,  2.01it/s]  6%|██████▊                                                                                                          | 77/1288 [00:52<10:18,  1.96it/s]  6%|██████▊                                                                                                          | 78/1288 [00:53<10:05,  2.00it/s]  6%|██████▉                                                                                                          | 79/1288 [00:53<10:20,  1.95it/s]  6%|███████                                                                                                          | 80/1288 [00:54<10:00,  2.01it/s]                                                                                                                                                        {'loss': 6.7895, 'grad_norm': 4.3125, 'learning_rate': 0.09996294850025658, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3021.75, 'epoch': 0.25}
  6%|███████                                                                                                          | 80/1288 [00:54<10:00,  2.01it/s]  6%|███████                                                                                                          | 81/1288 [00:54<10:04,  2.00it/s]  6%|███████▏                                                                                                         | 82/1288 [00:55<10:04,  2.00it/s]  6%|███████▎                                                                                                         | 83/1288 [00:55<10:12,  1.97it/s]  7%|███████▎                                                                                                         | 84/1288 [00:56<09:58,  2.01it/s]  7%|███████▍                                                                                                         | 85/1288 [00:56<09:41,  2.07it/s]  7%|███████▌                                                                                                         | 86/1288 [00:56<09:32,  2.10it/s]  7%|███████▋                                                                                                         | 87/1288 [00:57<09:03,  2.21it/s]  7%|███████▋                                                                                                         | 88/1288 [00:57<09:20,  2.14it/s]  7%|███████▊                                                                                                         | 89/1288 [00:58<09:21,  2.13it/s]  7%|███████▉                                                                                                         | 90/1288 [00:58<09:20,  2.14it/s]                                                                                                                                                        {'loss': 6.6384, 'grad_norm': 3.984375, 'learning_rate': 0.09989710176649935, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2663.98, 'epoch': 0.28}
  7%|███████▉                                                                                                         | 90/1288 [00:58<09:20,  2.14it/s]  7%|███████▉                                                                                                         | 91/1288 [00:59<08:56,  2.23it/s]  7%|████████                                                                                                         | 92/1288 [00:59<08:59,  2.22it/s]  7%|████████▏                                                                                                        | 93/1288 [01:00<09:28,  2.10it/s]  7%|████████▏                                                                                                        | 94/1288 [01:00<09:21,  2.12it/s]  7%|████████▎                                                                                                        | 95/1288 [01:01<09:18,  2.14it/s]  7%|████████▍                                                                                                        | 96/1288 [01:01<09:41,  2.05it/s]  8%|████████▌                                                                                                        | 97/1288 [01:02<09:47,  2.03it/s]  8%|████████▌                                                                                                        | 98/1288 [01:02<10:47,  1.84it/s]  8%|████████▋                                                                                                        | 99/1288 [01:03<10:40,  1.86it/s]  8%|████████▋                                                                                                       | 100/1288 [01:03<10:26,  1.90it/s]                                                                                                                                                        {'loss': 6.7182, 'grad_norm': 4.125, 'learning_rate': 0.09979838588916229, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2654.54, 'epoch': 0.31}
  8%|████████▋                                                                                                       | 100/1288 [01:03<10:26,  1.90it/s][2025-12-13 20:40:17,596] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-100
  8%|████████▊                                                                                                       | 101/1288 [01:06<23:48,  1.20s/it]  8%|████████▊                                                                                                       | 102/1288 [01:07<19:38,  1.01it/s]  8%|████████▉                                                                                                       | 103/1288 [01:07<16:55,  1.17it/s]  8%|█████████                                                                                                       | 104/1288 [01:08<14:46,  1.33it/s]  8%|█████████▏                                                                                                      | 105/1288 [01:08<13:07,  1.50it/s]  8%|█████████▏                                                                                                      | 106/1288 [01:09<12:01,  1.64it/s]  8%|█████████▎                                                                                                      | 107/1288 [01:09<11:24,  1.72it/s]  8%|█████████▍                                                                                                      | 108/1288 [01:10<10:54,  1.80it/s]  8%|█████████▍                                                                                                      | 109/1288 [01:10<10:48,  1.82it/s]  9%|█████████▌                                                                                                      | 110/1288 [01:11<10:28,  1.87it/s]                                                                                                                                                        {'loss': 6.626, 'grad_norm': 3.921875, 'learning_rate': 0.0996668658961975, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2783.22, 'epoch': 0.34}
  9%|█████████▌                                                                                                      | 110/1288 [01:11<10:28,  1.87it/s]  9%|█████████▋                                                                                                      | 111/1288 [01:11<10:08,  1.94it/s]  9%|█████████▋                                                                                                      | 112/1288 [01:12<09:49,  2.00it/s]  9%|█████████▊                                                                                                      | 113/1288 [01:12<09:51,  1.99it/s]  9%|█████████▉                                                                                                      | 114/1288 [01:13<10:00,  1.95it/s]  9%|██████████                                                                                                      | 115/1288 [01:13<09:55,  1.97it/s]  9%|██████████                                                                                                      | 116/1288 [01:14<09:41,  2.01it/s]  9%|██████████▏                                                                                                     | 117/1288 [01:14<10:05,  1.93it/s]  9%|██████████▎                                                                                                     | 118/1288 [01:15<09:50,  1.98it/s]  9%|██████████▎                                                                                                     | 119/1288 [01:15<09:48,  1.99it/s]  9%|██████████▍                                                                                                     | 120/1288 [01:16<09:36,  2.03it/s]                                                                                                                                                        {'loss': 6.3855, 'grad_norm': 3.828125, 'learning_rate': 0.09950262842489216, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2989.57, 'epoch': 0.37}
  9%|██████████▍                                                                                                     | 120/1288 [01:16<09:36,  2.03it/s]  9%|██████████▌                                                                                                     | 121/1288 [01:16<09:38,  2.02it/s]  9%|██████████▌                                                                                                     | 122/1288 [01:17<09:41,  2.00it/s] 10%|██████████▋                                                                                                     | 123/1288 [01:17<10:25,  1.86it/s] 10%|██████████▊                                                                                                     | 124/1288 [01:18<10:17,  1.88it/s] 10%|██████████▊                                                                                                     | 125/1288 [01:18<10:16,  1.89it/s] 10%|██████████▉                                                                                                     | 126/1288 [01:19<10:03,  1.92it/s] 10%|███████████                                                                                                     | 127/1288 [01:19<09:46,  1.98it/s] 10%|███████████▏                                                                                                    | 128/1288 [01:20<09:43,  1.99it/s] 10%|███████████▏                                                                                                    | 129/1288 [01:20<09:55,  1.95it/s] 10%|███████████▎                                                                                                    | 130/1288 [01:21<09:39,  2.00it/s]                                                                                                                                                        {'loss': 6.4989, 'grad_norm': 3.140625, 'learning_rate': 0.099305781664797, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2968.88, 'epoch': 0.4}
 10%|███████████▎                                                                                                    | 130/1288 [01:21<09:39,  2.00it/s] 10%|███████████▍                                                                                                    | 131/1288 [01:21<09:26,  2.04it/s] 10%|███████████▍                                                                                                    | 132/1288 [01:22<09:20,  2.06it/s] 10%|███████████▌                                                                                                    | 133/1288 [01:22<09:37,  2.00it/s] 10%|███████████▋                                                                                                    | 134/1288 [01:23<09:45,  1.97it/s] 10%|███████████▋                                                                                                    | 135/1288 [01:23<09:55,  1.94it/s] 11%|███████████▊                                                                                                    | 136/1288 [01:24<09:54,  1.94it/s] 11%|███████████▉                                                                                                    | 137/1288 [01:24<10:02,  1.91it/s] 11%|████████████                                                                                                    | 138/1288 [01:25<09:55,  1.93it/s] 11%|████████████                                                                                                    | 139/1288 [01:25<09:54,  1.93it/s] 11%|████████████▏                                                                                                   | 140/1288 [01:26<10:03,  1.90it/s]                                                                                                                                                        {'loss': 6.2947, 'grad_norm': 3.578125, 'learning_rate': 0.0990764552864579, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2775.85, 'epoch': 0.43}
 11%|████████████▏                                                                                                   | 140/1288 [01:26<10:03,  1.90it/s] 11%|████████████▎                                                                                                   | 141/1288 [01:26<09:44,  1.96it/s] 11%|████████████▎                                                                                                   | 142/1288 [01:27<09:52,  1.94it/s] 11%|████████████▍                                                                                                   | 143/1288 [01:27<09:56,  1.92it/s] 11%|████████████▌                                                                                                   | 144/1288 [01:28<09:59,  1.91it/s] 11%|████████████▌                                                                                                   | 145/1288 [01:28<09:39,  1.97it/s] 11%|████████████▋                                                                                                   | 146/1288 [01:29<09:11,  2.07it/s] 11%|████████████▊                                                                                                   | 147/1288 [01:29<09:20,  2.04it/s] 11%|████████████▊                                                                                                   | 148/1288 [01:30<09:24,  2.02it/s] 12%|████████████▉                                                                                                   | 149/1288 [01:30<09:23,  2.02it/s] 12%|█████████████                                                                                                   | 150/1288 [01:31<09:24,  2.02it/s]                                                                                                                                                        {'loss': 6.6597, 'grad_norm': 3.375, 'learning_rate': 0.09881480035599667, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2704.94, 'epoch': 0.46}
 12%|█████████████                                                                                                   | 150/1288 [01:31<09:24,  2.02it/s] 12%|█████████████▏                                                                                                  | 151/1288 [01:31<09:28,  2.00it/s] 12%|█████████████▏                                                                                                  | 152/1288 [01:32<09:27,  2.00it/s] 12%|█████████████▎                                                                                                  | 153/1288 [01:32<09:20,  2.03it/s] 12%|█████████████▍                                                                                                  | 154/1288 [01:33<09:20,  2.02it/s] 12%|█████████████▍                                                                                                  | 155/1288 [01:33<09:13,  2.05it/s] 12%|█████████████▌                                                                                                  | 156/1288 [01:34<09:18,  2.03it/s] 12%|█████████████▋                                                                                                  | 157/1288 [01:34<09:24,  2.00it/s] 12%|█████████████▋                                                                                                  | 158/1288 [01:35<09:40,  1.95it/s] 12%|█████████████▊                                                                                                  | 159/1288 [01:35<09:47,  1.92it/s] 12%|█████████████▉                                                                                                  | 160/1288 [01:36<09:31,  1.97it/s]                                                                                                                                                        {'loss': 6.4151, 'grad_norm': 3.9375, 'learning_rate': 0.09852098923559818, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2997.44, 'epoch': 0.5}
 12%|█████████████▉                                                                                                  | 160/1288 [01:36<09:31,  1.97it/s] 12%|██████████████                                                                                                  | 161/1288 [01:36<09:16,  2.02it/s] 13%|██████████████                                                                                                  | 162/1288 [01:37<09:18,  2.02it/s] 13%|██████████████▏                                                                                                 | 163/1288 [01:37<09:41,  1.94it/s] 13%|██████████████▎                                                                                                 | 164/1288 [01:38<09:34,  1.96it/s] 13%|██████████████▎                                                                                                 | 165/1288 [01:38<09:33,  1.96it/s] 13%|██████████████▍                                                                                                 | 166/1288 [01:39<09:28,  1.97it/s] 13%|██████████████▌                                                                                                 | 167/1288 [01:39<09:28,  1.97it/s] 13%|██████████████▌                                                                                                 | 168/1288 [01:40<09:15,  2.02it/s] 13%|██████████████▋                                                                                                 | 169/1288 [01:40<09:09,  2.04it/s] 13%|██████████████▊                                                                                                 | 170/1288 [01:41<09:01,  2.06it/s]                                                                                                                                                        {'loss': 6.7421, 'grad_norm': 4.0, 'learning_rate': 0.09819521546996864, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2851.12, 'epoch': 0.53}
 13%|██████████████▊                                                                                                 | 170/1288 [01:41<09:01,  2.06it/s] 13%|██████████████▊                                                                                                 | 171/1288 [01:41<09:19,  2.00it/s] 13%|██████████████▉                                                                                                 | 172/1288 [01:42<09:16,  2.00it/s] 13%|███████████████                                                                                                 | 173/1288 [01:42<09:26,  1.97it/s] 14%|███████████████▏                                                                                                | 174/1288 [01:43<09:13,  2.01it/s] 14%|███████████████▏                                                                                                | 175/1288 [01:43<09:17,  2.00it/s] 14%|███████████████▎                                                                                                | 176/1288 [01:44<09:31,  1.95it/s] 14%|███████████████▍                                                                                                | 177/1288 [01:44<09:15,  2.00it/s] 14%|███████████████▍                                                                                                | 178/1288 [01:45<09:20,  1.98it/s] 14%|███████████████▌                                                                                                | 179/1288 [01:46<09:24,  1.96it/s] 14%|███████████████▋                                                                                                | 180/1288 [01:46<09:11,  2.01it/s]                                                                                                                                                        {'loss': 6.3521, 'grad_norm': 3.1875, 'learning_rate': 0.09783769365884022, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3006.04, 'epoch': 0.56}
 14%|███████████████▋                                                                                                | 180/1288 [01:46<09:11,  2.01it/s] 14%|███████████████▋                                                                                                | 181/1288 [01:47<09:25,  1.96it/s] 14%|███████████████▊                                                                                                | 182/1288 [01:47<09:24,  1.96it/s] 14%|███████████████▉                                                                                                | 183/1288 [01:47<08:59,  2.05it/s] 14%|████████████████                                                                                                | 184/1288 [01:48<08:58,  2.05it/s] 14%|████████████████                                                                                                | 185/1288 [01:48<08:48,  2.09it/s] 14%|████████████████▏                                                                                               | 186/1288 [01:49<08:59,  2.04it/s] 15%|████████████████▎                                                                                               | 187/1288 [01:49<09:09,  2.00it/s] 15%|████████████████▎                                                                                               | 188/1288 [01:50<09:11,  1.99it/s] 15%|████████████████▍                                                                                               | 189/1288 [01:50<09:14,  1.98it/s] 15%|████████████████▌                                                                                               | 190/1288 [01:51<09:14,  1.98it/s]                                                                                                                                                        {'loss': 6.7043, 'grad_norm': 4.90625, 'learning_rate': 0.09744865931560606, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2724.09, 'epoch': 0.59}
 15%|████████████████▌                                                                                               | 190/1288 [01:51<09:14,  1.98it/s] 15%|████████████████▌                                                                                               | 191/1288 [01:52<09:35,  1.91it/s] 15%|████████████████▋                                                                                               | 192/1288 [01:52<09:09,  2.00it/s] 15%|████████████████▊                                                                                               | 193/1288 [01:53<09:21,  1.95it/s] 15%|████████████████▊                                                                                               | 194/1288 [01:53<09:11,  1.98it/s] 15%|████████████████▉                                                                                               | 195/1288 [01:54<09:21,  1.95it/s] 15%|█████████████████                                                                                               | 196/1288 [01:54<09:10,  1.98it/s] 15%|█████████████████▏                                                                                              | 197/1288 [01:55<09:11,  1.98it/s] 15%|█████████████████▏                                                                                              | 198/1288 [01:55<09:21,  1.94it/s] 15%|█████████████████▎                                                                                              | 199/1288 [01:56<09:07,  1.99it/s] 16%|█████████████████▍                                                                                              | 200/1288 [01:56<09:07,  1.99it/s]                                                                                                                                                        {'loss': 6.4776, 'grad_norm': 3.265625, 'learning_rate': 0.09702836871217839, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2788.26, 'epoch': 0.62}
 16%|█████████████████▍                                                                                              | 200/1288 [01:56<09:07,  1.99it/s][2025-12-13 20:41:10,285] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-200
 16%|█████████████████▍                                                                                              | 201/1288 [01:59<21:40,  1.20s/it] 16%|█████████████████▌                                                                                              | 202/1288 [01:59<17:44,  1.02it/s] 16%|█████████████████▋                                                                                              | 203/1288 [02:00<15:00,  1.20it/s] 16%|█████████████████▋                                                                                              | 204/1288 [02:00<13:14,  1.36it/s] 16%|█████████████████▊                                                                                              | 205/1288 [02:01<11:48,  1.53it/s] 16%|█████████████████▉                                                                                              | 206/1288 [02:01<11:14,  1.60it/s] 16%|██████████████████                                                                                              | 207/1288 [02:02<10:46,  1.67it/s] 16%|██████████████████                                                                                              | 208/1288 [02:02<10:05,  1.78it/s] 16%|██████████████████▏                                                                                             | 209/1288 [02:03<09:35,  1.87it/s] 16%|██████████████████▎                                                                                             | 210/1288 [02:03<09:46,  1.84it/s]                                                                                                                                                        {'loss': 6.4896, 'grad_norm': 3.265625, 'learning_rate': 0.09657709871017242, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2517.3, 'epoch': 0.65}
 16%|██████████████████▎                                                                                             | 210/1288 [02:03<09:46,  1.84it/s] 16%|██████████████████▎                                                                                             | 211/1288 [02:04<09:22,  1.91it/s] 16%|██████████████████▍                                                                                             | 212/1288 [02:04<09:04,  1.97it/s] 17%|██████████████████▌                                                                                             | 213/1288 [02:05<09:06,  1.97it/s] 17%|██████████████████▌                                                                                             | 214/1288 [02:05<09:15,  1.93it/s] 17%|██████████████████▋                                                                                             | 215/1288 [02:06<09:13,  1.94it/s] 17%|██████████████████▊                                                                                             | 216/1288 [02:06<09:22,  1.91it/s] 17%|██████████████████▊                                                                                             | 217/1288 [02:07<09:03,  1.97it/s] 17%|██████████████████▉                                                                                             | 218/1288 [02:07<09:13,  1.93it/s] 17%|███████████████████                                                                                             | 219/1288 [02:08<09:06,  1.96it/s] 17%|███████████████████▏                                                                                            | 220/1288 [02:08<09:03,  1.97it/s]                                                                                                                                                        {'loss': 6.4361, 'grad_norm': 3.921875, 'learning_rate': 0.0960951465785269, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2767.27, 'epoch': 0.68}
 17%|███████████████████▏                                                                                            | 220/1288 [02:08<09:03,  1.97it/s] 17%|███████████████████▏                                                                                            | 221/1288 [02:09<08:58,  1.98it/s] 17%|███████████████████▎                                                                                            | 222/1288 [02:09<08:57,  1.98it/s] 17%|███████████████████▍                                                                                            | 223/1288 [02:10<08:55,  1.99it/s] 17%|███████████████████▍                                                                                            | 224/1288 [02:10<08:42,  2.04it/s] 17%|███████████████████▌                                                                                            | 225/1288 [02:11<08:55,  1.98it/s] 18%|███████████████████▋                                                                                            | 226/1288 [02:11<08:45,  2.02it/s] 18%|███████████████████▋                                                                                            | 227/1288 [02:12<08:41,  2.04it/s] 18%|███████████████████▊                                                                                            | 228/1288 [02:12<08:36,  2.05it/s] 18%|███████████████████▉                                                                                            | 229/1288 [02:13<08:32,  2.07it/s] 18%|████████████████████                                                                                            | 230/1288 [02:13<08:28,  2.08it/s]                                                                                                                                                        {'loss': 6.7116, 'grad_norm': 3.390625, 'learning_rate': 0.09558282979768164, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2773.87, 'epoch': 0.71}
 18%|████████████████████                                                                                            | 230/1288 [02:13<08:28,  2.08it/s] 18%|████████████████████                                                                                            | 231/1288 [02:14<08:27,  2.08it/s] 18%|████████████████████▏                                                                                           | 232/1288 [02:14<08:35,  2.05it/s] 18%|████████████████████▎                                                                                           | 233/1288 [02:15<08:43,  2.02it/s] 18%|████████████████████▎                                                                                           | 234/1288 [02:15<08:48,  1.99it/s] 18%|████████████████████▍                                                                                           | 235/1288 [02:16<09:02,  1.94it/s] 18%|████████████████████▌                                                                                           | 236/1288 [02:16<08:58,  1.95it/s] 18%|████████████████████▌                                                                                           | 237/1288 [02:17<08:53,  1.97it/s] 18%|████████████████████▋                                                                                           | 238/1288 [02:17<09:00,  1.94it/s] 19%|████████████████████▊                                                                                           | 239/1288 [02:18<08:47,  1.99it/s] 19%|████████████████████▊                                                                                           | 240/1288 [02:18<08:36,  2.03it/s]                                                                                                                                                        {'loss': 6.5845, 'grad_norm': 3.796875, 'learning_rate': 0.09504048585044089, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3004.89, 'epoch': 0.74}
 19%|████████████████████▊                                                                                           | 240/1288 [02:18<08:36,  2.03it/s] 19%|████████████████████▉                                                                                           | 241/1288 [02:19<08:29,  2.06it/s] 19%|█████████████████████                                                                                           | 242/1288 [02:19<08:24,  2.07it/s] 19%|█████████████████████▏                                                                                          | 243/1288 [02:20<08:32,  2.04it/s] 19%|█████████████████████▏                                                                                          | 244/1288 [02:20<08:26,  2.06it/s] 19%|█████████████████████▎                                                                                          | 245/1288 [02:21<08:42,  2.00it/s] 19%|█████████████████████▍                                                                                          | 246/1288 [02:21<08:45,  1.98it/s] 19%|█████████████████████▍                                                                                          | 247/1288 [02:22<08:44,  1.98it/s] 19%|█████████████████████▌                                                                                          | 248/1288 [02:22<08:34,  2.02it/s] 19%|█████████████████████▋                                                                                          | 249/1288 [02:23<08:25,  2.06it/s] 19%|█████████████████████▋                                                                                          | 250/1288 [02:23<08:29,  2.04it/s]                                                                                                                                                        {'loss': 6.5014, 'grad_norm': 3.453125, 'learning_rate': 0.09446847199966041, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2717.09, 'epoch': 0.77}
 19%|█████████████████████▋                                                                                          | 250/1288 [02:23<08:29,  2.04it/s] 19%|█████████████████████▊                                                                                          | 251/1288 [02:24<08:22,  2.06it/s] 20%|█████████████████████▉                                                                                          | 252/1288 [02:24<08:40,  1.99it/s] 20%|██████████████████████                                                                                          | 253/1288 [02:25<08:33,  2.02it/s] 20%|██████████████████████                                                                                          | 254/1288 [02:25<08:35,  2.01it/s] 20%|██████████████████████▏                                                                                         | 255/1288 [02:26<08:49,  1.95it/s] 20%|██████████████████████▎                                                                                         | 256/1288 [02:26<08:45,  1.96it/s] 20%|██████████████████████▎                                                                                         | 257/1288 [02:27<08:32,  2.01it/s] 20%|██████████████████████▍                                                                                         | 258/1288 [02:27<08:40,  1.98it/s] 20%|██████████████████████▌                                                                                         | 259/1288 [02:28<08:28,  2.02it/s] 20%|██████████████████████▌                                                                                         | 260/1288 [02:28<08:31,  2.01it/s]                                                                                                                                                        {'loss': 6.5148, 'grad_norm': 3.21875, 'learning_rate': 0.09386716505290467, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2794.02, 'epoch': 0.8}
 20%|██████████████████████▌                                                                                         | 260/1288 [02:28<08:31,  2.01it/s] 20%|██████████████████████▋                                                                                         | 261/1288 [02:29<08:32,  2.01it/s] 20%|██████████████████████▊                                                                                         | 262/1288 [02:29<08:42,  1.97it/s] 20%|██████████████████████▊                                                                                         | 263/1288 [02:30<08:41,  1.96it/s] 20%|██████████████████████▉                                                                                         | 264/1288 [02:30<08:39,  1.97it/s] 21%|███████████████████████                                                                                         | 265/1288 [02:31<08:28,  2.01it/s] 21%|███████████████████████▏                                                                                        | 266/1288 [02:31<08:16,  2.06it/s] 21%|███████████████████████▏                                                                                        | 267/1288 [02:32<08:34,  1.99it/s] 21%|███████████████████████▎                                                                                        | 268/1288 [02:32<08:33,  1.99it/s] 21%|███████████████████████▍                                                                                        | 269/1288 [02:33<08:33,  1.99it/s] 21%|███████████████████████▍                                                                                        | 270/1288 [02:33<08:34,  1.98it/s]                                                                                                                                                        {'loss': 6.5165, 'grad_norm': 3.984375, 'learning_rate': 0.09323696111422922, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2794.22, 'epoch': 0.84}
 21%|███████████████████████▍                                                                                        | 270/1288 [02:33<08:34,  1.98it/s] 21%|███████████████████████▌                                                                                        | 271/1288 [02:34<08:01,  2.11it/s] 21%|███████████████████████▋                                                                                        | 272/1288 [02:34<07:49,  2.16it/s] 21%|███████████████████████▋                                                                                        | 273/1288 [02:35<08:12,  2.06it/s] 21%|███████████████████████▊                                                                                        | 274/1288 [02:35<08:13,  2.05it/s] 21%|███████████████████████▉                                                                                        | 275/1288 [02:36<08:30,  1.99it/s] 21%|████████████████████████                                                                                        | 276/1288 [02:36<08:29,  1.99it/s] 22%|████████████████████████                                                                                        | 277/1288 [02:37<08:14,  2.05it/s] 22%|████████████████████████▏                                                                                       | 278/1288 [02:37<08:14,  2.04it/s] 22%|████████████████████████▎                                                                                       | 279/1288 [02:38<08:26,  1.99it/s] 22%|████████████████████████▎                                                                                       | 280/1288 [02:38<08:49,  1.91it/s]                                                                                                                                                        {'loss': 6.6842, 'grad_norm': 3.640625, 'learning_rate': 0.09257827532325158, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2621.14, 'epoch': 0.87}
 22%|████████████████████████▎                                                                                       | 280/1288 [02:38<08:49,  1.91it/s] 22%|████████████████████████▍                                                                                       | 281/1288 [02:39<08:44,  1.92it/s] 22%|████████████████████████▌                                                                                       | 282/1288 [02:39<08:45,  1.91it/s] 22%|████████████████████████▌                                                                                       | 283/1288 [02:40<08:42,  1.92it/s] 22%|████████████████████████▋                                                                                       | 284/1288 [02:40<08:29,  1.97it/s] 22%|████████████████████████▊                                                                                       | 285/1288 [02:41<08:36,  1.94it/s] 22%|████████████████████████▊                                                                                       | 286/1288 [02:41<08:43,  1.91it/s] 22%|████████████████████████▉                                                                                       | 287/1288 [02:42<08:39,  1.93it/s] 22%|█████████████████████████                                                                                       | 288/1288 [02:42<08:21,  1.99it/s] 22%|█████████████████████████▏                                                                                      | 289/1288 [02:43<08:13,  2.02it/s] 23%|█████████████████████████▏                                                                                      | 290/1288 [02:43<08:17,  2.01it/s]                                                                                                                                                        {'loss': 6.4367, 'grad_norm': 3.109375, 'learning_rate': 0.09189154158168292, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2840.79, 'epoch': 0.9}
 23%|█████████████████████████▏                                                                                      | 290/1288 [02:43<08:17,  2.01it/s] 23%|█████████████████████████▎                                                                                      | 291/1288 [02:44<08:16,  2.01it/s] 23%|█████████████████████████▍                                                                                      | 292/1288 [02:44<08:06,  2.05it/s] 23%|█████████████████████████▍                                                                                      | 293/1288 [02:45<08:06,  2.04it/s] 23%|█████████████████████████▌                                                                                      | 294/1288 [02:45<08:09,  2.03it/s] 23%|█████████████████████████▋                                                                                      | 295/1288 [02:46<08:22,  1.97it/s] 23%|█████████████████████████▋                                                                                      | 296/1288 [02:46<08:19,  1.98it/s] 23%|█████████████████████████▊                                                                                      | 297/1288 [02:47<08:28,  1.95it/s] 23%|█████████████████████████▉                                                                                      | 298/1288 [02:47<08:19,  1.98it/s] 23%|██████████████████████████                                                                                      | 299/1288 [02:48<07:54,  2.09it/s] 23%|██████████████████████████                                                                                      | 300/1288 [02:48<08:03,  2.04it/s]                                                                                                                                                        {'loss': 6.5053, 'grad_norm': 3.6875, 'learning_rate': 0.0911772122675002, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2634.1, 'epoch': 0.93}
 23%|██████████████████████████                                                                                      | 300/1288 [02:48<08:03,  2.04it/s][2025-12-13 20:42:02,557] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-300
 23%|██████████████████████████▏                                                                                     | 301/1288 [02:51<20:04,  1.22s/it] 23%|██████████████████████████▎                                                                                     | 302/1288 [02:52<16:17,  1.01it/s] 24%|██████████████████████████▎                                                                                     | 303/1288 [02:52<14:00,  1.17it/s] 24%|██████████████████████████▍                                                                                     | 304/1288 [02:53<12:27,  1.32it/s] 24%|██████████████████████████▌                                                                                     | 305/1288 [02:53<11:11,  1.46it/s] 24%|██████████████████████████▌                                                                                     | 306/1288 [02:54<10:26,  1.57it/s] 24%|██████████████████████████▋                                                                                     | 307/1288 [02:54<09:05,  1.80it/s] 24%|██████████████████████████▊                                                                                     | 308/1288 [02:55<08:48,  1.85it/s] 24%|██████████████████████████▊                                                                                     | 309/1288 [02:55<08:46,  1.86it/s] 24%|██████████████████████████▉                                                                                     | 310/1288 [02:56<08:42,  1.87it/s]                                                                                                                                                        {'loss': 6.4672, 'grad_norm': 3.171875, 'learning_rate': 0.09043575793694733, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2600.45, 'epoch': 0.96}
 24%|██████████████████████████▉                                                                                     | 310/1288 [02:56<08:42,  1.87it/s] 24%|███████████████████████████                                                                                     | 311/1288 [02:56<08:24,  1.94it/s] 24%|███████████████████████████▏                                                                                    | 312/1288 [02:57<08:28,  1.92it/s] 24%|███████████████████████████▏                                                                                    | 313/1288 [02:57<08:31,  1.91it/s] 24%|███████████████████████████▎                                                                                    | 314/1288 [02:58<08:23,  1.94it/s] 24%|███████████████████████████▍                                                                                    | 315/1288 [02:58<08:07,  2.00it/s] 25%|███████████████████████████▍                                                                                    | 316/1288 [02:59<07:55,  2.04it/s] 25%|███████████████████████████▌                                                                                    | 317/1288 [02:59<07:55,  2.04it/s] 25%|███████████████████████████▋                                                                                    | 318/1288 [03:00<07:48,  2.07it/s] 25%|███████████████████████████▋                                                                                    | 319/1288 [03:00<07:43,  2.09it/s] 25%|███████████████████████████▊                                                                                    | 320/1288 [03:01<07:31,  2.14it/s]                                                                                                                                                        {'loss': 6.4323, 'grad_norm': 3.625, 'learning_rate': 0.08966766701456176, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2831.85, 'epoch': 0.99}
 25%|███████████████████████████▊                                                                                    | 320/1288 [03:01<07:31,  2.14it/s] 25%|███████████████████████████▉                                                                                    | 321/1288 [03:01<07:34,  2.13it/s] 25%|████████████████████████████                                                                                    | 322/1288 [03:02<07:35,  2.12it/s] 25%|████████████████████████████                                                                                    | 323/1288 [03:02<07:43,  2.08it/s] 25%|████████████████████████████▏                                                                                   | 324/1288 [03:04<15:00,  1.07it/s] 25%|████████████████████████████▎                                                                                   | 325/1288 [03:05<12:56,  1.24it/s] 25%|████████████████████████████▎                                                                                   | 326/1288 [03:05<11:38,  1.38it/s] 25%|████████████████████████████▍                                                                                   | 327/1288 [03:06<10:32,  1.52it/s] 25%|████████████████████████████▌                                                                                   | 328/1288 [03:06<09:55,  1.61it/s] 26%|████████████████████████████▌                                                                                   | 329/1288 [03:07<09:12,  1.74it/s] 26%|████████████████████████████▋                                                                                   | 330/1288 [03:07<08:39,  1.84it/s]                                                                                                                                                        {'loss': 6.4867, 'grad_norm': 4.0, 'learning_rate': 0.08887344547143032, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2989.61, 'epoch': 1.02}
 26%|████████████████████████████▋                                                                                   | 330/1288 [03:07<08:39,  1.84it/s] 26%|████████████████████████████▊                                                                                   | 331/1288 [03:07<08:15,  1.93it/s] 26%|████████████████████████████▊                                                                                   | 332/1288 [03:08<08:12,  1.94it/s] 26%|████████████████████████████▉                                                                                   | 333/1288 [03:08<07:59,  1.99it/s] 26%|█████████████████████████████                                                                                   | 334/1288 [03:09<08:09,  1.95it/s] 26%|█████████████████████████████▏                                                                                  | 335/1288 [03:10<08:15,  1.92it/s] 26%|█████████████████████████████▏                                                                                  | 336/1288 [03:10<07:48,  2.03it/s] 26%|█████████████████████████████▎                                                                                  | 337/1288 [03:10<07:39,  2.07it/s] 26%|█████████████████████████████▍                                                                                  | 338/1288 [03:11<07:43,  2.05it/s] 26%|█████████████████████████████▍                                                                                  | 339/1288 [03:11<07:46,  2.03it/s] 26%|█████████████████████████████▌                                                                                  | 340/1288 [03:12<07:44,  2.04it/s]                                                                                                                                                        {'loss': 6.3927, 'grad_norm': 3.9375, 'learning_rate': 0.08805361649188657, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2768.75, 'epoch': 1.05}
 26%|█████████████████████████████▌                                                                                  | 340/1288 [03:12<07:44,  2.04it/s] 26%|█████████████████████████████▋                                                                                  | 341/1288 [03:12<07:45,  2.03it/s] 27%|█████████████████████████████▋                                                                                  | 342/1288 [03:13<07:48,  2.02it/s] 27%|█████████████████████████████▊                                                                                  | 343/1288 [03:13<07:53,  1.99it/s] 27%|█████████████████████████████▉                                                                                  | 344/1288 [03:14<07:56,  1.98it/s] 27%|██████████████████████████████                                                                                  | 345/1288 [03:14<07:48,  2.01it/s] 27%|██████████████████████████████                                                                                  | 346/1288 [03:15<07:42,  2.04it/s] 27%|██████████████████████████████▏                                                                                 | 347/1288 [03:15<07:54,  1.98it/s] 27%|██████████████████████████████▎                                                                                 | 348/1288 [03:16<07:43,  2.03it/s] 27%|██████████████████████████████▎                                                                                 | 349/1288 [03:16<07:48,  2.01it/s] 27%|██████████████████████████████▍                                                                                 | 350/1288 [03:17<07:48,  2.00it/s]                                                                                                                                                        {'loss': 6.4186, 'grad_norm': 3.28125, 'learning_rate': 0.08720872012886918, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2754.62, 'epoch': 1.08}
 27%|██████████████████████████████▍                                                                                 | 350/1288 [03:17<07:48,  2.00it/s] 27%|██████████████████████████████▌                                                                                 | 351/1288 [03:17<07:49,  1.99it/s] 27%|██████████████████████████████▌                                                                                 | 352/1288 [03:18<07:50,  1.99it/s] 27%|██████████████████████████████▋                                                                                 | 353/1288 [03:18<07:53,  1.97it/s] 27%|██████████████████████████████▊                                                                                 | 354/1288 [03:19<07:45,  2.01it/s] 28%|██████████████████████████████▊                                                                                 | 355/1288 [03:19<07:38,  2.03it/s] 28%|██████████████████████████████▉                                                                                 | 356/1288 [03:20<07:53,  1.97it/s] 28%|███████████████████████████████                                                                                 | 357/1288 [03:20<07:53,  1.96it/s] 28%|███████████████████████████████▏                                                                                | 358/1288 [03:21<08:03,  1.92it/s] 28%|███████████████████████████████▏                                                                                | 359/1288 [03:22<08:14,  1.88it/s] 28%|███████████████████████████████▎                                                                                | 360/1288 [03:22<07:57,  1.95it/s]                                                                                                                                                        {'loss': 6.371, 'grad_norm': 3.59375, 'learning_rate': 0.08633931294816821, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3122.51, 'epoch': 1.11}
 28%|███████████████████████████████▎                                                                                | 360/1288 [03:22<07:57,  1.95it/s] 28%|███████████████████████████████▍                                                                                | 361/1288 [03:22<07:24,  2.08it/s] 28%|███████████████████████████████▍                                                                                | 362/1288 [03:23<07:40,  2.01it/s] 28%|███████████████████████████████▌                                                                                | 363/1288 [03:23<07:44,  1.99it/s] 28%|███████████████████████████████▋                                                                                | 364/1288 [03:24<07:36,  2.02it/s] 28%|███████████████████████████████▋                                                                                | 365/1288 [03:24<07:32,  2.04it/s] 28%|███████████████████████████████▊                                                                                | 366/1288 [03:25<07:34,  2.03it/s] 28%|███████████████████████████████▉                                                                                | 367/1288 [03:25<07:29,  2.05it/s] 29%|████████████████████████████████                                                                                | 368/1288 [03:26<07:43,  1.98it/s] 29%|████████████████████████████████                                                                                | 369/1288 [03:26<07:34,  2.02it/s] 29%|████████████████████████████████▏                                                                               | 370/1288 [03:27<07:26,  2.06it/s]                                                                                                                                                        {'loss': 6.4765, 'grad_norm': 3.125, 'learning_rate': 0.08544596766179377, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2779.89, 'epoch': 1.15}
 29%|████████████████████████████████▏                                                                               | 370/1288 [03:27<07:26,  2.06it/s] 29%|████████████████████████████████▎                                                                               | 371/1288 [03:27<07:40,  1.99it/s] 29%|████████████████████████████████▎                                                                               | 372/1288 [03:28<07:52,  1.94it/s] 29%|████████████████████████████████▍                                                                               | 373/1288 [03:28<07:49,  1.95it/s] 29%|████████████████████████████████▌                                                                               | 374/1288 [03:29<07:31,  2.02it/s] 29%|████████████████████████████████▌                                                                               | 375/1288 [03:29<07:34,  2.01it/s] 29%|████████████████████████████████▋                                                                               | 376/1288 [03:30<07:26,  2.04it/s] 29%|████████████████████████████████▊                                                                               | 377/1288 [03:30<07:42,  1.97it/s] 29%|████████████████████████████████▊                                                                               | 378/1288 [03:31<07:40,  1.98it/s] 29%|████████████████████████████████▉                                                                               | 379/1288 [03:31<07:31,  2.02it/s] 30%|█████████████████████████████████                                                                               | 380/1288 [03:32<07:28,  2.02it/s]                                                                                                                                                        {'loss': 6.5867, 'grad_norm': 3.953125, 'learning_rate': 0.08452927275070858, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2866.01, 'epoch': 1.18}
 30%|█████████████████████████████████                                                                               | 380/1288 [03:32<07:28,  2.02it/s] 30%|█████████████████████████████████▏                                                                              | 381/1288 [03:32<07:42,  1.96it/s] 30%|█████████████████████████████████▏                                                                              | 382/1288 [03:33<07:33,  2.00it/s] 30%|█████████████████████████████████▎                                                                              | 383/1288 [03:33<07:24,  2.04it/s] 30%|█████████████████████████████████▍                                                                              | 384/1288 [03:34<07:34,  1.99it/s] 30%|█████████████████████████████████▍                                                                              | 385/1288 [03:34<07:40,  1.96it/s] 30%|█████████████████████████████████▌                                                                              | 386/1288 [03:35<07:36,  1.97it/s] 30%|█████████████████████████████████▋                                                                              | 387/1288 [03:35<07:27,  2.01it/s] 30%|█████████████████████████████████▋                                                                              | 388/1288 [03:36<07:39,  1.96it/s] 30%|█████████████████████████████████▊                                                                              | 389/1288 [03:37<07:45,  1.93it/s] 30%|█████████████████████████████████▉                                                                              | 390/1288 [03:37<07:53,  1.90it/s]                                                                                                                                                        {'loss': 5.9749, 'grad_norm': 3.671875, 'learning_rate': 0.08358983207717285, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2710.22, 'epoch': 1.21}
 30%|█████████████████████████████████▉                                                                              | 390/1288 [03:37<07:53,  1.90it/s] 30%|██████████████████████████████████                                                                              | 391/1288 [03:38<07:47,  1.92it/s] 30%|██████████████████████████████████                                                                              | 392/1288 [03:38<07:33,  1.98it/s] 31%|██████████████████████████████████▏                                                                             | 393/1288 [03:39<07:41,  1.94it/s] 31%|██████████████████████████████████▎                                                                             | 394/1288 [03:39<07:36,  1.96it/s] 31%|██████████████████████████████████▎                                                                             | 395/1288 [03:40<07:43,  1.93it/s] 31%|██████████████████████████████████▍                                                                             | 396/1288 [03:40<07:36,  1.96it/s] 31%|██████████████████████████████████▌                                                                             | 397/1288 [03:41<07:36,  1.95it/s] 31%|██████████████████████████████████▌                                                                             | 398/1288 [03:41<07:41,  1.93it/s] 31%|██████████████████████████████████▋                                                                             | 399/1288 [03:42<07:37,  1.94it/s] 31%|██████████████████████████████████▊                                                                             | 400/1288 [03:42<07:24,  2.00it/s]                                                                                                                                                        {'loss': 6.4844, 'grad_norm': 4.34375, 'learning_rate': 0.082628264486957, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3085.71, 'epoch': 1.24}
 31%|██████████████████████████████████▊                                                                             | 400/1288 [03:42<07:24,  2.00it/s][2025-12-13 20:42:56,390] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-400
 31%|██████████████████████████████████▊                                                                             | 401/1288 [03:45<19:20,  1.31s/it] 31%|██████████████████████████████████▉                                                                             | 402/1288 [03:46<15:46,  1.07s/it] 31%|███████████████████████████████████                                                                             | 403/1288 [03:46<13:26,  1.10it/s] 31%|███████████████████████████████████▏                                                                            | 404/1288 [03:47<11:27,  1.29it/s] 31%|███████████████████████████████████▏                                                                            | 405/1288 [03:47<10:07,  1.45it/s] 32%|███████████████████████████████████▎                                                                            | 406/1288 [03:48<08:56,  1.64it/s] 32%|███████████████████████████████████▍                                                                            | 407/1288 [03:48<08:38,  1.70it/s] 32%|███████████████████████████████████▍                                                                            | 408/1288 [03:49<07:31,  1.95it/s] 32%|███████████████████████████████████▌                                                                            | 409/1288 [03:49<07:28,  1.96it/s] 32%|███████████████████████████████████▋                                                                            | 410/1288 [03:50<07:27,  1.96it/s]                                                                                                                                                        {'loss': 6.5087, 'grad_norm': 2.921875, 'learning_rate': 0.08164520340168403, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2511.25, 'epoch': 1.27}
 32%|███████████████████████████████████▋                                                                            | 410/1288 [03:50<07:27,  1.96it/s] 32%|███████████████████████████████████▋                                                                            | 411/1288 [03:50<07:20,  1.99it/s] 32%|███████████████████████████████████▊                                                                            | 412/1288 [03:51<07:05,  2.06it/s] 32%|███████████████████████████████████▉                                                                            | 413/1288 [03:51<07:11,  2.03it/s] 32%|████████████████████████████████████                                                                            | 414/1288 [03:52<06:57,  2.09it/s] 32%|████████████████████████████████████                                                                            | 415/1288 [03:52<06:59,  2.08it/s] 32%|████████████████████████████████████▏                                                                           | 416/1288 [03:53<07:05,  2.05it/s] 32%|████████████████████████████████████▎                                                                           | 417/1288 [03:53<07:11,  2.02it/s] 32%|████████████████████████████████████▎                                                                           | 418/1288 [03:54<07:04,  2.05it/s] 33%|████████████████████████████████████▍                                                                           | 419/1288 [03:54<07:02,  2.06it/s] 33%|████████████████████████████████████▌                                                                           | 420/1288 [03:54<06:59,  2.07it/s]                                                                                                                                                        {'loss': 6.4753, 'grad_norm': 3.296875, 'learning_rate': 0.08064129640157033, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2577.32, 'epoch': 1.3}
 33%|████████████████████████████████████▌                                                                           | 420/1288 [03:54<06:59,  2.07it/s] 33%|████████████████████████████████████▌                                                                           | 421/1288 [03:55<07:06,  2.03it/s] 33%|████████████████████████████████████▋                                                                           | 422/1288 [03:55<07:01,  2.05it/s] 33%|████████████████████████████████████▊                                                                           | 423/1288 [03:56<07:21,  1.96it/s] 33%|████████████████████████████████████▊                                                                           | 424/1288 [03:57<07:18,  1.97it/s] 33%|████████████████████████████████████▉                                                                           | 425/1288 [03:57<07:11,  2.00it/s] 33%|█████████████████████████████████████                                                                           | 426/1288 [03:57<07:02,  2.04it/s] 33%|█████████████████████████████████████▏                                                                          | 427/1288 [03:58<06:58,  2.06it/s] 33%|█████████████████████████████████████▏                                                                          | 428/1288 [03:58<06:56,  2.07it/s] 33%|█████████████████████████████████████▎                                                                          | 429/1288 [03:59<07:08,  2.00it/s] 33%|█████████████████████████████████████▍                                                                          | 430/1288 [04:00<07:20,  1.95it/s]                                                                                                                                                        {'loss': 6.4225, 'grad_norm': 3.65625, 'learning_rate': 0.07961720479883966, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2557.22, 'epoch': 1.33}
 33%|█████████████████████████████████████▍                                                                          | 430/1288 [04:00<07:20,  1.95it/s] 33%|█████████████████████████████████████▍                                                                          | 431/1288 [04:00<06:59,  2.04it/s] 34%|█████████████████████████████████████▌                                                                          | 432/1288 [04:00<06:54,  2.06it/s] 34%|█████████████████████████████████████▋                                                                          | 433/1288 [04:01<06:50,  2.08it/s] 34%|█████████████████████████████████████▋                                                                          | 434/1288 [04:01<07:05,  2.01it/s] 34%|█████████████████████████████████████▊                                                                          | 435/1288 [04:02<06:59,  2.03it/s] 34%|█████████████████████████████████████▉                                                                          | 436/1288 [04:02<07:05,  2.00it/s] 34%|██████████████████████████████████████                                                                          | 437/1288 [04:03<06:55,  2.05it/s] 34%|██████████████████████████████████████                                                                          | 438/1288 [04:03<06:48,  2.08it/s] 34%|██████████████████████████████████████▏                                                                         | 439/1288 [04:04<06:47,  2.09it/s] 34%|██████████████████████████████████████▎                                                                         | 440/1288 [04:04<06:57,  2.03it/s]                                                                                                                                                        {'loss': 6.4401, 'grad_norm': 3.828125, 'learning_rate': 0.07857360320209125, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2451.93, 'epoch': 1.36}
 34%|██████████████████████████████████████▎                                                                         | 440/1288 [04:04<06:57,  2.03it/s] 34%|██████████████████████████████████████▎                                                                         | 441/1288 [04:05<06:54,  2.04it/s] 34%|██████████████████████████████████████▍                                                                         | 442/1288 [04:05<06:58,  2.02it/s] 34%|██████████████████████████████████████▌                                                                         | 443/1288 [04:06<06:53,  2.05it/s] 34%|██████████████████████████████████████▌                                                                         | 444/1288 [04:06<06:54,  2.04it/s] 35%|██████████████████████████████████████▋                                                                         | 445/1288 [04:07<07:06,  1.98it/s] 35%|██████████████████████████████████████▊                                                                         | 446/1288 [04:07<07:09,  1.96it/s] 35%|██████████████████████████████████████▊                                                                         | 447/1288 [04:08<07:19,  1.91it/s] 35%|██████████████████████████████████████▉                                                                         | 448/1288 [04:08<07:20,  1.91it/s] 35%|███████████████████████████████████████                                                                         | 449/1288 [04:09<07:19,  1.91it/s] 35%|███████████████████████████████████████▏                                                                        | 450/1288 [04:09<07:07,  1.96it/s]                                                                                                                                                        {'loss': 6.464, 'grad_norm': 2.96875, 'learning_rate': 0.07751117907190919, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2964.67, 'epoch': 1.39}
 35%|███████████████████████████████████████▏                                                                        | 450/1288 [04:09<07:07,  1.96it/s] 35%|███████████████████████████████████████▏                                                                        | 451/1288 [04:10<06:59,  2.00it/s] 35%|███████████████████████████████████████▎                                                                        | 452/1288 [04:10<06:48,  2.05it/s] 35%|███████████████████████████████████████▍                                                                        | 453/1288 [04:11<06:55,  2.01it/s] 35%|███████████████████████████████████████▍                                                                        | 454/1288 [04:11<06:57,  2.00it/s] 35%|███████████████████████████████████████▌                                                                        | 455/1288 [04:12<07:04,  1.96it/s] 35%|███████████████████████████████████████▋                                                                        | 456/1288 [04:12<07:04,  1.96it/s] 35%|███████████████████████████████████████▋                                                                        | 457/1288 [04:13<07:09,  1.93it/s] 36%|███████████████████████████████████████▊                                                                        | 458/1288 [04:13<07:05,  1.95it/s] 36%|███████████████████████████████████████▉                                                                        | 459/1288 [04:14<06:45,  2.05it/s] 36%|████████████████████████████████████████                                                                        | 460/1288 [04:14<06:42,  2.06it/s]                                                                                                                                                        {'loss': 6.3442, 'grad_norm': 2.984375, 'learning_rate': 0.07643063226800556, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2760.04, 'epoch': 1.42}
 36%|████████████████████████████████████████                                                                        | 460/1288 [04:14<06:42,  2.06it/s] 36%|████████████████████████████████████████                                                                        | 461/1288 [04:15<06:37,  2.08it/s] 36%|████████████████████████████████████████▏                                                                       | 462/1288 [04:15<06:44,  2.04it/s] 36%|████████████████████████████████████████▎                                                                       | 463/1288 [04:16<06:55,  1.99it/s] 36%|████████████████████████████████████████▎                                                                       | 464/1288 [04:16<06:55,  1.98it/s] 36%|████████████████████████████████████████▍                                                                       | 465/1288 [04:17<06:55,  1.98it/s] 36%|████████████████████████████████████████▌                                                                       | 466/1288 [04:17<06:47,  2.02it/s] 36%|████████████████████████████████████████▌                                                                       | 467/1288 [04:18<07:00,  1.95it/s] 36%|████████████████████████████████████████▋                                                                       | 468/1288 [04:18<06:51,  1.99it/s] 36%|████████████████████████████████████████▊                                                                       | 469/1288 [04:19<06:58,  1.96it/s] 36%|████████████████████████████████████████▊                                                                       | 470/1288 [04:19<06:55,  1.97it/s]                                                                                                                                                        {'loss': 6.4986, 'grad_norm': 2.96875, 'learning_rate': 0.07533267458819598, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2793.22, 'epoch': 1.46}
 36%|████████████████████████████████████████▊                                                                       | 470/1288 [04:19<06:55,  1.97it/s] 37%|████████████████████████████████████████▉                                                                       | 471/1288 [04:20<06:53,  1.98it/s] 37%|█████████████████████████████████████████                                                                       | 472/1288 [04:20<06:45,  2.01it/s] 37%|█████████████████████████████████████████▏                                                                      | 473/1288 [04:21<06:48,  2.00it/s] 37%|█████████████████████████████████████████▏                                                                      | 474/1288 [04:21<06:41,  2.03it/s] 37%|█████████████████████████████████████████▎                                                                      | 475/1288 [04:22<06:50,  1.98it/s] 37%|█████████████████████████████████████████▍                                                                      | 476/1288 [04:22<06:40,  2.03it/s] 37%|█████████████████████████████████████████▍                                                                      | 477/1288 [04:23<06:43,  2.01it/s] 37%|█████████████████████████████████████████▌                                                                      | 478/1288 [04:23<06:45,  2.00it/s] 37%|█████████████████████████████████████████▋                                                                      | 479/1288 [04:24<06:49,  1.98it/s] 37%|█████████████████████████████████████████▋                                                                      | 480/1288 [04:24<06:48,  1.98it/s]                                                                                                                                                        {'loss': 6.3065, 'grad_norm': 2.84375, 'learning_rate': 0.07421802929951088, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2766.29, 'epoch': 1.49}
 37%|█████████████████████████████████████████▋                                                                      | 480/1288 [04:24<06:48,  1.98it/s] 37%|█████████████████████████████████████████▊                                                                      | 481/1288 [04:25<06:55,  1.94it/s] 37%|█████████████████████████████████████████▉                                                                      | 482/1288 [04:26<06:56,  1.94it/s] 38%|██████████████████████████████████████████                                                                      | 483/1288 [04:26<07:00,  1.91it/s] 38%|██████████████████████████████████████████                                                                      | 484/1288 [04:27<07:09,  1.87it/s] 38%|██████████████████████████████████████████▏                                                                     | 485/1288 [04:27<06:54,  1.94it/s] 38%|██████████████████████████████████████████▎                                                                     | 486/1288 [04:28<06:54,  1.94it/s] 38%|██████████████████████████████████████████▎                                                                     | 487/1288 [04:28<06:57,  1.92it/s] 38%|██████████████████████████████████████████▍                                                                     | 488/1288 [04:29<06:45,  1.97it/s] 38%|██████████████████████████████████████████▌                                                                     | 489/1288 [04:29<06:47,  1.96it/s] 38%|██████████████████████████████████████████▌                                                                     | 490/1288 [04:30<06:39,  2.00it/s]                                                                                                                                                        {'loss': 6.3311, 'grad_norm': 3.109375, 'learning_rate': 0.0730874306617517, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3121.39, 'epoch': 1.52}
 38%|██████████████████████████████████████████▌                                                                     | 490/1288 [04:30<06:39,  2.00it/s] 38%|██████████████████████████████████████████▋                                                                     | 491/1288 [04:30<06:51,  1.94it/s] 38%|██████████████████████████████████████████▊                                                                     | 492/1288 [04:31<06:39,  1.99it/s] 38%|██████████████████████████████████████████▊                                                                     | 493/1288 [04:31<06:48,  1.95it/s] 38%|██████████████████████████████████████████▉                                                                     | 494/1288 [04:32<07:01,  1.88it/s] 38%|███████████████████████████████████████████                                                                     | 495/1288 [04:32<07:04,  1.87it/s] 39%|███████████████████████████████████████████▏                                                                    | 496/1288 [04:33<06:55,  1.91it/s] 39%|███████████████████████████████████████████▏                                                                    | 497/1288 [04:33<06:50,  1.93it/s] 39%|███████████████████████████████████████████▎                                                                    | 498/1288 [04:34<06:56,  1.90it/s] 39%|███████████████████████████████████████████▍                                                                    | 499/1288 [04:34<06:50,  1.92it/s] 39%|███████████████████████████████████████████▍                                                                    | 500/1288 [04:35<06:44,  1.95it/s]                                                                                                                                                        {'loss': 6.2641, 'grad_norm': 4.1875, 'learning_rate': 0.0719416234438056, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3107.7, 'epoch': 1.55}
 39%|███████████████████████████████████████████▍                                                                    | 500/1288 [04:35<06:44,  1.95it/s][2025-12-13 20:43:49,088] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-500
 39%|███████████████████████████████████████████▌                                                                    | 501/1288 [04:38<17:37,  1.34s/it] 39%|███████████████████████████████████████████▋                                                                    | 502/1288 [04:39<14:18,  1.09s/it] 39%|███████████████████████████████████████████▋                                                                    | 503/1288 [04:39<12:09,  1.08it/s] 39%|███████████████████████████████████████████▊                                                                    | 504/1288 [04:40<10:26,  1.25it/s] 39%|███████████████████████████████████████████▉                                                                    | 505/1288 [04:40<09:23,  1.39it/s] 39%|████████████████████████████████████████████                                                                    | 506/1288 [04:41<08:39,  1.51it/s] 39%|████████████████████████████████████████████                                                                    | 507/1288 [04:41<07:52,  1.65it/s] 39%|████████████████████████████████████████████▏                                                                   | 508/1288 [04:42<07:23,  1.76it/s] 40%|████████████████████████████████████████████▎                                                                   | 509/1288 [04:42<07:16,  1.78it/s] 40%|████████████████████████████████████████████▎                                                                   | 510/1288 [04:43<06:56,  1.87it/s]                                                                                                                                                        {'loss': 6.3479, 'grad_norm': 3.15625, 'learning_rate': 0.07078136243303754, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3030.94, 'epoch': 1.58}
 40%|████████████████████████████████████████████▎                                                                   | 510/1288 [04:43<06:56,  1.87it/s] 40%|████████████████████████████████████████████▍                                                                   | 511/1288 [04:43<06:51,  1.89it/s] 40%|████████████████████████████████████████████▌                                                                   | 512/1288 [04:44<06:55,  1.87it/s] 40%|████████████████████████████████████████████▌                                                                   | 513/1288 [04:44<06:41,  1.93it/s] 40%|████████████████████████████████████████████▋                                                                   | 514/1288 [04:45<06:45,  1.91it/s] 40%|████████████████████████████████████████████▊                                                                   | 515/1288 [04:45<06:33,  1.97it/s] 40%|████████████████████████████████████████████▊                                                                   | 516/1288 [04:46<06:23,  2.01it/s] 40%|████████████████████████████████████████████▉                                                                   | 517/1288 [04:46<06:23,  2.01it/s] 40%|█████████████████████████████████████████████                                                                   | 518/1288 [04:47<06:32,  1.96it/s] 40%|█████████████████████████████████████████████▏                                                                  | 519/1288 [04:47<06:28,  1.98it/s] 40%|█████████████████████████████████████████████▏                                                                  | 520/1288 [04:48<06:19,  2.02it/s]                                                                                                                                                        {'loss': 6.3151, 'grad_norm': 3.078125, 'learning_rate': 0.06960741193808273, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2984.92, 'epoch': 1.61}
 40%|█████████████████████████████████████████████▏                                                                  | 520/1288 [04:48<06:19,  2.02it/s] 40%|█████████████████████████████████████████████▎                                                                  | 521/1288 [04:48<06:23,  2.00it/s] 41%|█████████████████████████████████████████████▍                                                                  | 522/1288 [04:49<06:20,  2.01it/s] 41%|█████████████████████████████████████████████▍                                                                  | 523/1288 [04:49<06:15,  2.04it/s] 41%|█████████████████████████████████████████████▌                                                                  | 524/1288 [04:50<06:09,  2.07it/s] 41%|█████████████████████████████████████████████▋                                                                  | 525/1288 [04:50<06:07,  2.08it/s] 41%|█████████████████████████████████████████████▋                                                                  | 526/1288 [04:51<06:04,  2.09it/s] 41%|█████████████████████████████████████████████▊                                                                  | 527/1288 [04:51<06:17,  2.01it/s] 41%|█████████████████████████████████████████████▉                                                                  | 528/1288 [04:52<06:10,  2.05it/s] 41%|██████████████████████████████████████████████                                                                  | 529/1288 [04:52<06:15,  2.02it/s] 41%|██████████████████████████████████████████████                                                                  | 530/1288 [04:52<05:36,  2.25it/s]                                                                                                                                                        {'loss': 6.4469, 'grad_norm': 8.875, 'learning_rate': 0.06842054528536716, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3648.13, 'epoch': 1.64}
 41%|██████████████████████████████████████████████                                                                  | 530/1288 [04:52<05:36,  2.25it/s] 41%|██████████████████████████████████████████████▏                                                                 | 531/1288 [04:53<05:49,  2.17it/s] 41%|██████████████████████████████████████████████▎                                                                 | 532/1288 [04:53<05:48,  2.17it/s] 41%|██████████████████████████████████████████████▎                                                                 | 533/1288 [04:54<06:03,  2.08it/s] 41%|██████████████████████████████████████████████▍                                                                 | 534/1288 [04:55<06:17,  2.00it/s] 42%|██████████████████████████████████████████████▌                                                                 | 535/1288 [04:55<06:20,  1.98it/s] 42%|██████████████████████████████████████████████▌                                                                 | 536/1288 [04:56<06:13,  2.01it/s] 42%|██████████████████████████████████████████████▋                                                                 | 537/1288 [04:56<06:23,  1.96it/s] 42%|██████████████████████████████████████████████▊                                                                 | 538/1288 [04:57<06:13,  2.01it/s] 42%|██████████████████████████████████████████████▊                                                                 | 539/1288 [04:57<06:12,  2.01it/s] 42%|██████████████████████████████████████████████▉                                                                 | 540/1288 [04:58<06:07,  2.03it/s]                                                                                                                                                        {'loss': 6.6141, 'grad_norm': 2.765625, 'learning_rate': 0.06722154430968755, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2837.68, 'epoch': 1.67}
 42%|██████████████████████████████████████████████▉                                                                 | 540/1288 [04:58<06:07,  2.03it/s] 42%|███████████████████████████████████████████████                                                                 | 541/1288 [04:58<06:09,  2.02it/s] 42%|███████████████████████████████████████████████▏                                                                | 542/1288 [04:59<06:21,  1.96it/s] 42%|███████████████████████████████████████████████▏                                                                | 543/1288 [04:59<06:33,  1.89it/s] 42%|███████████████████████████████████████████████▎                                                                | 544/1288 [05:00<06:27,  1.92it/s] 42%|███████████████████████████████████████████████▍                                                                | 545/1288 [05:00<06:33,  1.89it/s] 42%|███████████████████████████████████████████████▍                                                                | 546/1288 [05:01<06:34,  1.88it/s] 42%|███████████████████████████████████████████████▌                                                                | 547/1288 [05:01<06:35,  1.88it/s] 43%|███████████████████████████████████████████████▋                                                                | 548/1288 [05:02<06:34,  1.87it/s] 43%|███████████████████████████████████████████████▋                                                                | 549/1288 [05:02<06:28,  1.90it/s] 43%|███████████████████████████████████████████████▊                                                                | 550/1288 [05:03<06:32,  1.88it/s]                                                                                                                                                        {'loss': 6.1934, 'grad_norm': 4.0625, 'learning_rate': 0.06601119883918677, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2895.51, 'epoch': 1.7}
 43%|███████████████████████████████████████████████▊                                                                | 550/1288 [05:03<06:32,  1.88it/s] 43%|███████████████████████████████████████████████▉                                                                | 551/1288 [05:03<06:21,  1.93it/s] 43%|████████████████████████████████████████████████                                                                | 552/1288 [05:04<06:18,  1.94it/s] 43%|████████████████████████████████████████████████                                                                | 553/1288 [05:04<06:11,  1.98it/s] 43%|████████████████████████████████████████████████▏                                                               | 554/1288 [05:05<06:08,  1.99it/s] 43%|████████████████████████████████████████████████▎                                                               | 555/1288 [05:05<06:15,  1.95it/s] 43%|████████████████████████████████████████████████▎                                                               | 556/1288 [05:06<06:12,  1.96it/s] 43%|████████████████████████████████████████████████▍                                                               | 557/1288 [05:06<06:03,  2.01it/s] 43%|████████████████████████████████████████████████▌                                                               | 558/1288 [05:07<06:11,  1.96it/s] 43%|████████████████████████████████████████████████▌                                                               | 559/1288 [05:07<06:11,  1.96it/s] 43%|████████████████████████████████████████████████▋                                                               | 560/1288 [05:08<06:15,  1.94it/s]                                                                                                                                                        {'loss': 6.5664, 'grad_norm': 6.75, 'learning_rate': 0.06479030617506353, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2552.3, 'epoch': 1.73}
 43%|████████████████████████████████████████████████▋                                                               | 560/1288 [05:08<06:15,  1.94it/s] 44%|████████████████████████████████████████████████▊                                                               | 561/1288 [05:08<05:56,  2.04it/s] 44%|████████████████████████████████████████████████▊                                                               | 562/1288 [05:09<05:50,  2.07it/s] 44%|████████████████████████████████████████████████▉                                                               | 563/1288 [05:09<05:57,  2.03it/s] 44%|█████████████████████████████████████████████████                                                               | 564/1288 [05:10<06:01,  2.00it/s] 44%|█████████████████████████████████████████████████▏                                                              | 565/1288 [05:10<06:03,  1.99it/s] 44%|█████████████████████████████████████████████████▏                                                              | 566/1288 [05:11<05:57,  2.02it/s] 44%|█████████████████████████████████████████████████▎                                                              | 567/1288 [05:11<06:05,  1.97it/s] 44%|█████████████████████████████████████████████████▍                                                              | 568/1288 [05:12<06:11,  1.94it/s] 44%|█████████████████████████████████████████████████▍                                                              | 569/1288 [05:12<06:17,  1.90it/s] 44%|█████████████████████████████████████████████████▌                                                              | 570/1288 [05:13<06:12,  1.93it/s]                                                                                                                                                        {'loss': 6.2367, 'grad_norm': 2.625, 'learning_rate': 0.0635596705663594, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2747.54, 'epoch': 1.76}
 44%|█████████████████████████████████████████████████▌                                                              | 570/1288 [05:13<06:12,  1.93it/s] 44%|█████████████████████████████████████████████████▋                                                              | 571/1288 [05:13<06:07,  1.95it/s] 44%|█████████████████████████████████████████████████▋                                                              | 572/1288 [05:14<05:59,  1.99it/s] 44%|█████████████████████████████████████████████████▊                                                              | 573/1288 [05:14<06:10,  1.93it/s] 45%|█████████████████████████████████████████████████▉                                                              | 574/1288 [05:15<05:59,  1.99it/s] 45%|██████████████████████████████████████████████████                                                              | 575/1288 [05:15<06:06,  1.94it/s] 45%|██████████████████████████████████████████████████                                                              | 576/1288 [05:16<05:59,  1.98it/s] 45%|██████████████████████████████████████████████████▏                                                             | 577/1288 [05:16<06:07,  1.93it/s] 45%|██████████████████████████████████████████████████▎                                                             | 578/1288 [05:17<06:04,  1.95it/s] 45%|██████████████████████████████████████████████████▎                                                             | 579/1288 [05:17<05:55,  1.99it/s] 45%|██████████████████████████████████████████████████▍                                                             | 580/1288 [05:18<05:53,  2.00it/s]                                                                                                                                                        {'loss': 6.686, 'grad_norm': 3.875, 'learning_rate': 0.062320102680168944, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2769.21, 'epoch': 1.8}
 45%|██████████████████████████████████████████████████▍                                                             | 580/1288 [05:18<05:53,  2.00it/s] 45%|██████████████████████████████████████████████████▌                                                             | 581/1288 [05:18<05:55,  1.99it/s] 45%|██████████████████████████████████████████████████▌                                                             | 582/1288 [05:19<05:53,  2.00it/s] 45%|██████████████████████████████████████████████████▋                                                             | 583/1288 [05:19<05:48,  2.02it/s] 45%|██████████████████████████████████████████████████▊                                                             | 584/1288 [05:20<05:57,  1.97it/s] 45%|██████████████████████████████████████████████████▊                                                             | 585/1288 [05:20<05:50,  2.00it/s] 45%|██████████████████████████████████████████████████▉                                                             | 586/1288 [05:21<05:43,  2.05it/s] 46%|███████████████████████████████████████████████████                                                             | 587/1288 [05:21<05:40,  2.06it/s] 46%|███████████████████████████████████████████████████▏                                                            | 588/1288 [05:22<05:50,  2.00it/s] 46%|███████████████████████████████████████████████████▏                                                            | 589/1288 [05:23<05:59,  1.94it/s] 46%|███████████████████████████████████████████████████▎                                                            | 590/1288 [05:23<06:06,  1.91it/s]                                                                                                                                                        {'loss': 6.1809, 'grad_norm': 4.21875, 'learning_rate': 0.06107241906762215, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2668.32, 'epoch': 1.83}
 46%|███████████████████████████████████████████████████▎                                                            | 590/1288 [05:23<06:06,  1.91it/s] 46%|███████████████████████████████████████████████████▍                                                            | 591/1288 [05:24<05:55,  1.96it/s] 46%|███████████████████████████████████████████████████▍                                                            | 592/1288 [05:24<05:52,  1.98it/s] 46%|███████████████████████████████████████████████████▌                                                            | 593/1288 [05:25<05:53,  1.97it/s] 46%|███████████████████████████████████████████████████▋                                                            | 594/1288 [05:25<05:45,  2.01it/s] 46%|███████████████████████████████████████████████████▋                                                            | 595/1288 [05:25<05:41,  2.03it/s] 46%|███████████████████████████████████████████████████▊                                                            | 596/1288 [05:26<05:19,  2.16it/s] 46%|███████████████████████████████████████████████████▉                                                            | 597/1288 [05:26<05:28,  2.11it/s] 46%|████████████████████████████████████████████████████                                                            | 598/1288 [05:27<05:28,  2.10it/s] 47%|████████████████████████████████████████████████████                                                            | 599/1288 [05:27<05:33,  2.07it/s] 47%|████████████████████████████████████████████████████▏                                                           | 600/1288 [05:28<05:37,  2.04it/s]                                                                                                                                                        {'loss': 6.5976, 'grad_norm': 3.578125, 'learning_rate': 0.059817441625990564, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2375.01, 'epoch': 1.86}
 47%|████████████████████████████████████████████████████▏                                                           | 600/1288 [05:28<05:37,  2.04it/s][2025-12-13 20:44:42,105] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-600
 47%|████████████████████████████████████████████████████▎                                                           | 601/1288 [05:31<15:27,  1.35s/it] 47%|████████████████████████████████████████████████████▎                                                           | 602/1288 [05:32<12:31,  1.10s/it] 47%|████████████████████████████████████████████████████▍                                                           | 603/1288 [05:32<10:36,  1.08it/s] 47%|████████████████████████████████████████████████████▌                                                           | 604/1288 [05:33<09:20,  1.22it/s] 47%|████████████████████████████████████████████████████▌                                                           | 605/1288 [05:33<08:18,  1.37it/s] 47%|████████████████████████████████████████████████████▋                                                           | 606/1288 [05:34<07:40,  1.48it/s] 47%|████████████████████████████████████████████████████▊                                                           | 607/1288 [05:34<07:05,  1.60it/s] 47%|████████████████████████████████████████████████████▊                                                           | 608/1288 [05:35<06:39,  1.70it/s] 47%|████████████████████████████████████████████████████▉                                                           | 609/1288 [05:35<06:15,  1.81it/s] 47%|█████████████████████████████████████████████████████                                                           | 610/1288 [05:36<06:08,  1.84it/s]                                                                                                                                                        {'loss': 6.3878, 'grad_norm': 3.015625, 'learning_rate': 0.058555997057272116, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2894.95, 'epoch': 1.89}
 47%|█████████████████████████████████████████████████████                                                           | 610/1288 [05:36<06:08,  1.84it/s] 47%|█████████████████████████████████████████████████████▏                                                          | 611/1288 [05:36<06:01,  1.87it/s] 48%|█████████████████████████████████████████████████████▏                                                          | 612/1288 [05:37<05:54,  1.91it/s] 48%|█████████████████████████████████████████████████████▎                                                          | 613/1288 [05:37<05:51,  1.92it/s] 48%|█████████████████████████████████████████████████████▍                                                          | 614/1288 [05:38<05:48,  1.93it/s] 48%|█████████████████████████████████████████████████████▍                                                          | 615/1288 [05:38<05:52,  1.91it/s] 48%|█████████████████████████████████████████████████████▌                                                          | 616/1288 [05:39<05:48,  1.93it/s] 48%|█████████████████████████████████████████████████████▋                                                          | 617/1288 [05:40<05:52,  1.90it/s] 48%|█████████████████████████████████████████████████████▋                                                          | 618/1288 [05:40<05:41,  1.96it/s] 48%|█████████████████████████████████████████████████████▊                                                          | 619/1288 [05:40<05:32,  2.01it/s] 48%|█████████████████████████████████████████████████████▉                                                          | 620/1288 [05:41<05:26,  2.05it/s]                                                                                                                                                        {'loss': 6.307, 'grad_norm': 3.171875, 'learning_rate': 0.05728891632361043, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2924.97, 'epoch': 1.92}
 48%|█████████████████████████████████████████████████████▉                                                          | 620/1288 [05:41<05:26,  2.05it/s] 48%|██████████████████████████████████████████████████████                                                          | 621/1288 [05:41<05:23,  2.06it/s] 48%|██████████████████████████████████████████████████████                                                          | 622/1288 [05:42<05:33,  2.00it/s] 48%|██████████████████████████████████████████████████████▏                                                         | 623/1288 [05:42<05:36,  1.98it/s] 48%|██████████████████████████████████████████████████████▎                                                         | 624/1288 [05:43<05:16,  2.10it/s] 49%|██████████████████████████████████████████████████████▎                                                         | 625/1288 [05:43<05:23,  2.05it/s] 49%|██████████████████████████████████████████████████████▍                                                         | 626/1288 [05:44<05:18,  2.08it/s] 49%|██████████████████████████████████████████████████████▌                                                         | 627/1288 [05:44<05:31,  2.00it/s] 49%|██████████████████████████████████████████████████████▌                                                         | 628/1288 [05:45<05:31,  1.99it/s] 49%|██████████████████████████████████████████████████████▋                                                         | 629/1288 [05:45<05:31,  1.99it/s] 49%|██████████████████████████████████████████████████████▊                                                         | 630/1288 [05:46<05:38,  1.94it/s]                                                                                                                                                        {'loss': 6.3297, 'grad_norm': 3.328125, 'learning_rate': 0.05601703409990824, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2534.47, 'epoch': 1.95}
 49%|██████████████████████████████████████████████████████▊                                                         | 630/1288 [05:46<05:38,  1.94it/s] 49%|██████████████████████████████████████████████████████▊                                                         | 631/1288 [05:46<05:37,  1.95it/s] 49%|██████████████████████████████████████████████████████▉                                                         | 632/1288 [05:47<05:29,  1.99it/s] 49%|███████████████████████████████████████████████████████                                                         | 633/1288 [05:47<05:24,  2.02it/s] 49%|███████████████████████████████████████████████████████▏                                                        | 634/1288 [05:48<05:20,  2.04it/s] 49%|███████████████████████████████████████████████████████▏                                                        | 635/1288 [05:48<05:35,  1.95it/s] 49%|███████████████████████████████████████████████████████▎                                                        | 636/1288 [05:49<05:27,  1.99it/s] 49%|███████████████████████████████████████████████████████▍                                                        | 637/1288 [05:49<05:22,  2.02it/s] 50%|███████████████████████████████████████████████████████▍                                                        | 638/1288 [05:50<05:26,  1.99it/s] 50%|███████████████████████████████████████████████████████▌                                                        | 639/1288 [05:50<05:14,  2.06it/s] 50%|███████████████████████████████████████████████████████▋                                                        | 640/1288 [05:51<05:25,  1.99it/s]                                                                                                                                                        {'loss': 6.5449, 'grad_norm': 3.15625, 'learning_rate': 0.054741188223994756, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2444.72, 'epoch': 1.98}
 50%|███████████████████████████████████████████████████████▋                                                        | 640/1288 [05:51<05:25,  1.99it/s] 50%|███████████████████████████████████████████████████████▋                                                        | 641/1288 [05:51<05:19,  2.02it/s] 50%|███████████████████████████████████████████████████████▊                                                        | 642/1288 [05:52<05:29,  1.96it/s] 50%|███████████████████████████████████████████████████████▉                                                        | 643/1288 [05:53<05:39,  1.90it/s] 50%|████████████████████████████████████████████████████████                                                        | 644/1288 [05:53<05:34,  1.92it/s] 50%|████████████████████████████████████████████████████████                                                        | 645/1288 [05:53<05:24,  1.98it/s] 50%|████████████████████████████████████████████████████████▏                                                       | 646/1288 [05:54<05:15,  2.03it/s] 50%|████████████████████████████████████████████████████████▎                                                       | 647/1288 [05:56<08:41,  1.23it/s] 50%|████████████████████████████████████████████████████████▎                                                       | 648/1288 [05:56<07:43,  1.38it/s] 50%|████████████████████████████████████████████████████████▍                                                       | 649/1288 [05:57<07:05,  1.50it/s] 50%|████████████████████████████████████████████████████████▌                                                       | 650/1288 [05:57<06:28,  1.64it/s]                                                                                                                                                        {'loss': 6.3549, 'grad_norm': 3.875, 'learning_rate': 0.05346221914470959, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2934.56, 'epoch': 2.01}
 50%|████████████████████████████████████████████████████████▌                                                       | 650/1288 [05:57<06:28,  1.64it/s] 51%|████████████████████████████████████████████████████████▌                                                       | 651/1288 [05:58<06:02,  1.76it/s] 51%|████████████████████████████████████████████████████████▋                                                       | 652/1288 [05:58<05:55,  1.79it/s] 51%|████████████████████████████████████████████████████████▊                                                       | 653/1288 [05:59<05:53,  1.80it/s] 51%|████████████████████████████████████████████████████████▊                                                       | 654/1288 [05:59<05:37,  1.88it/s] 51%|████████████████████████████████████████████████████████▉                                                       | 655/1288 [06:00<05:18,  1.99it/s] 51%|█████████████████████████████████████████████████████████                                                       | 656/1288 [06:00<05:13,  2.02it/s] 51%|█████████████████████████████████████████████████████████▏                                                      | 657/1288 [06:01<05:16,  1.99it/s] 51%|█████████████████████████████████████████████████████████▏                                                      | 658/1288 [06:01<05:28,  1.92it/s] 51%|█████████████████████████████████████████████████████████▎                                                      | 659/1288 [06:02<05:25,  1.93it/s] 51%|█████████████████████████████████████████████████████████▍                                                      | 660/1288 [06:02<05:30,  1.90it/s]                                                                                                                                                        {'loss': 6.2473, 'grad_norm': 3.890625, 'learning_rate': 0.05218096936826681, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2664.64, 'epoch': 2.04}
 51%|█████████████████████████████████████████████████████████▍                                                      | 660/1288 [06:02<05:30,  1.90it/s] 51%|█████████████████████████████████████████████████████████▍                                                      | 661/1288 [06:03<05:20,  1.96it/s] 51%|█████████████████████████████████████████████████████████▌                                                      | 662/1288 [06:03<05:24,  1.93it/s] 51%|█████████████████████████████████████████████████████████▋                                                      | 663/1288 [06:04<05:22,  1.94it/s] 52%|█████████████████████████████████████████████████████████▋                                                      | 664/1288 [06:04<05:17,  1.97it/s] 52%|█████████████████████████████████████████████████████████▊                                                      | 665/1288 [06:05<05:23,  1.93it/s] 52%|█████████████████████████████████████████████████████████▉                                                      | 666/1288 [06:05<05:28,  1.90it/s] 52%|██████████████████████████████████████████████████████████                                                      | 667/1288 [06:06<05:12,  1.99it/s] 52%|██████████████████████████████████████████████████████████                                                      | 668/1288 [06:06<05:01,  2.06it/s] 52%|██████████████████████████████████████████████████████████▏                                                     | 669/1288 [06:07<04:59,  2.07it/s] 52%|██████████████████████████████████████████████████████████▎                                                     | 670/1288 [06:07<04:57,  2.08it/s]                                                                                                                                                        {'loss': 6.3402, 'grad_norm': 3.015625, 'learning_rate': 0.050898282903263536, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2834.09, 'epoch': 2.07}
 52%|██████████████████████████████████████████████████████████▎                                                     | 670/1288 [06:07<04:57,  2.08it/s] 52%|██████████████████████████████████████████████████████████▎                                                     | 671/1288 [06:08<05:00,  2.05it/s] 52%|██████████████████████████████████████████████████████████▍                                                     | 672/1288 [06:08<05:12,  1.97it/s] 52%|██████████████████████████████████████████████████████████▌                                                     | 673/1288 [06:09<05:10,  1.98it/s] 52%|██████████████████████████████████████████████████████████▌                                                     | 674/1288 [06:09<05:13,  1.96it/s] 52%|██████████████████████████████████████████████████████████▋                                                     | 675/1288 [06:10<05:13,  1.95it/s] 52%|██████████████████████████████████████████████████████████▊                                                     | 676/1288 [06:10<05:13,  1.95it/s] 53%|██████████████████████████████████████████████████████████▊                                                     | 677/1288 [06:11<05:05,  2.00it/s] 53%|██████████████████████████████████████████████████████████▉                                                     | 678/1288 [06:11<05:07,  1.98it/s] 53%|███████████████████████████████████████████████████████████                                                     | 679/1288 [06:12<05:09,  1.97it/s] 53%|███████████████████████████████████████████████████████████▏                                                    | 680/1288 [06:12<05:15,  1.93it/s]                                                                                                                                                        {'loss': 6.3827, 'grad_norm': 3.09375, 'learning_rate': 0.049615004704699074, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2670.66, 'epoch': 2.11}
 53%|███████████████████████████████████████████████████████████▏                                                    | 680/1288 [06:12<05:15,  1.93it/s] 53%|███████████████████████████████████████████████████████████▏                                                    | 681/1288 [06:13<05:25,  1.87it/s] 53%|███████████████████████████████████████████████████████████▎                                                    | 682/1288 [06:13<05:13,  1.93it/s] 53%|███████████████████████████████████████████████████████████▍                                                    | 683/1288 [06:14<05:22,  1.88it/s] 53%|███████████████████████████████████████████████████████████▍                                                    | 684/1288 [06:14<05:24,  1.86it/s] 53%|███████████████████████████████████████████████████████████▌                                                    | 685/1288 [06:15<05:19,  1.89it/s] 53%|███████████████████████████████████████████████████████████▋                                                    | 686/1288 [06:15<05:10,  1.94it/s] 53%|███████████████████████████████████████████████████████████▋                                                    | 687/1288 [06:16<05:08,  1.95it/s] 53%|███████████████████████████████████████████████████████████▊                                                    | 688/1288 [06:16<05:14,  1.91it/s] 53%|███████████████████████████████████████████████████████████▉                                                    | 689/1288 [06:17<05:04,  1.96it/s] 54%|████████████████████████████████████████████████████████████                                                    | 690/1288 [06:17<05:13,  1.91it/s]                                                                                                                                                        {'loss': 6.3279, 'grad_norm': 3.59375, 'learning_rate': 0.048331980117370346, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2701.62, 'epoch': 2.14}
 54%|████████████████████████████████████████████████████████████                                                    | 690/1288 [06:17<05:13,  1.91it/s] 54%|████████████████████████████████████████████████████████████                                                    | 691/1288 [06:18<05:06,  1.95it/s] 54%|████████████████████████████████████████████████████████████▏                                                   | 692/1288 [06:19<05:10,  1.92it/s] 54%|████████████████████████████████████████████████████████████▎                                                   | 693/1288 [06:19<05:03,  1.96it/s] 54%|████████████████████████████████████████████████████████████▎                                                   | 694/1288 [06:19<04:57,  2.00it/s] 54%|████████████████████████████████████████████████████████████▍                                                   | 695/1288 [06:20<04:57,  1.99it/s] 54%|████████████████████████████████████████████████████████████▌                                                   | 696/1288 [06:20<04:52,  2.02it/s] 54%|████████████████████████████████████████████████████████████▌                                                   | 697/1288 [06:21<04:56,  2.00it/s] 54%|████████████████████████████████████████████████████████████▋                                                   | 698/1288 [06:21<04:54,  2.00it/s] 54%|████████████████████████████████████████████████████████████▊                                                   | 699/1288 [06:22<04:50,  2.03it/s] 54%|████████████████████████████████████████████████████████████▊                                                   | 700/1288 [06:22<04:52,  2.01it/s]                                                                                                                                                        {'loss': 6.5604, 'grad_norm': 3.0, 'learning_rate': 0.0470500543190108, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2597.75, 'epoch': 2.17}
 54%|████████████████████████████████████████████████████████████▊                                                   | 700/1288 [06:22<04:52,  2.01it/s][2025-12-13 20:45:36,681] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-700
 54%|████████████████████████████████████████████████████████████▉                                                   | 701/1288 [06:26<12:22,  1.27s/it] 55%|█████████████████████████████████████████████████████████████                                                   | 702/1288 [06:26<10:07,  1.04s/it] 55%|█████████████████████████████████████████████████████████████▏                                                  | 703/1288 [06:26<08:28,  1.15it/s] 55%|█████████████████████████████████████████████████████████████▏                                                  | 704/1288 [06:27<07:19,  1.33it/s] 55%|█████████████████████████████████████████████████████████████▎                                                  | 705/1288 [06:27<06:34,  1.48it/s] 55%|█████████████████████████████████████████████████████████████▍                                                  | 706/1288 [06:28<06:00,  1.62it/s] 55%|█████████████████████████████████████████████████████████████▍                                                  | 707/1288 [06:28<05:45,  1.68it/s] 55%|█████████████████████████████████████████████████████████████▌                                                  | 708/1288 [06:29<05:24,  1.79it/s] 55%|█████████████████████████████████████████████████████████████▋                                                  | 709/1288 [06:29<05:14,  1.84it/s] 55%|█████████████████████████████████████████████████████████████▋                                                  | 710/1288 [06:30<05:07,  1.88it/s]                                                                                                                                                        {'loss': 6.5682, 'grad_norm': 3.28125, 'learning_rate': 0.04577007176353931, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2591.56, 'epoch': 2.2}
 55%|█████████████████████████████████████████████████████████████▋                                                  | 710/1288 [06:30<05:07,  1.88it/s] 55%|█████████████████████████████████████████████████████████████▊                                                  | 711/1288 [06:31<05:13,  1.84it/s] 55%|█████████████████████████████████████████████████████████████▉                                                  | 712/1288 [06:31<05:13,  1.84it/s] 55%|██████████████████████████████████████████████████████████████                                                  | 713/1288 [06:32<05:00,  1.92it/s] 55%|██████████████████████████████████████████████████████████████                                                  | 714/1288 [06:32<05:01,  1.90it/s] 56%|██████████████████████████████████████████████████████████████▏                                                 | 715/1288 [06:33<05:03,  1.89it/s] 56%|██████████████████████████████████████████████████████████████▎                                                 | 716/1288 [06:33<04:53,  1.95it/s] 56%|██████████████████████████████████████████████████████████████▎                                                 | 717/1288 [06:34<04:45,  2.00it/s] 56%|██████████████████████████████████████████████████████████████▍                                                 | 718/1288 [06:34<04:40,  2.03it/s] 56%|██████████████████████████████████████████████████████████████▌                                                 | 719/1288 [06:35<04:38,  2.04it/s] 56%|██████████████████████████████████████████████████████████████▌                                                 | 720/1288 [06:35<04:39,  2.03it/s]                                                                                                                                                        {'loss': 6.2766, 'grad_norm': 4.1875, 'learning_rate': 0.0444928756247857, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2830.22, 'epoch': 2.23}
 56%|██████████████████████████████████████████████████████████████▌                                                 | 720/1288 [06:35<04:39,  2.03it/s] 56%|██████████████████████████████████████████████████████████████▋                                                 | 721/1288 [06:36<04:48,  1.97it/s] 56%|██████████████████████████████████████████████████████████████▊                                                 | 722/1288 [06:36<04:35,  2.05it/s] 56%|██████████████████████████████████████████████████████████████▊                                                 | 723/1288 [06:36<04:32,  2.07it/s] 56%|██████████████████████████████████████████████████████████████▉                                                 | 724/1288 [06:37<04:32,  2.07it/s] 56%|███████████████████████████████████████████████████████████████                                                 | 725/1288 [06:37<04:31,  2.07it/s] 56%|███████████████████████████████████████████████████████████████▏                                                | 726/1288 [06:38<04:35,  2.04it/s] 56%|███████████████████████████████████████████████████████████████▏                                                | 727/1288 [06:38<04:31,  2.07it/s] 57%|███████████████████████████████████████████████████████████████▎                                                | 728/1288 [06:39<04:14,  2.20it/s] 57%|███████████████████████████████████████████████████████████████▍                                                | 729/1288 [06:39<04:19,  2.16it/s] 57%|███████████████████████████████████████████████████████████████▍                                                | 730/1288 [06:40<04:33,  2.04it/s]                                                                                                                                                        {'loss': 6.4892, 'grad_norm': 3.765625, 'learning_rate': 0.043219307241059796, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2173.3, 'epoch': 2.26}
 57%|███████████████████████████████████████████████████████████████▍                                                | 730/1288 [06:40<04:33,  2.04it/s] 57%|███████████████████████████████████████████████████████████████▌                                                | 731/1288 [06:40<04:44,  1.96it/s] 57%|███████████████████████████████████████████████████████████████▋                                                | 732/1288 [06:41<04:47,  1.94it/s] 57%|███████████████████████████████████████████████████████████████▋                                                | 733/1288 [06:41<04:43,  1.96it/s] 57%|███████████████████████████████████████████████████████████████▊                                                | 734/1288 [06:42<04:47,  1.93it/s] 57%|███████████████████████████████████████████████████████████████▉                                                | 735/1288 [06:43<04:50,  1.90it/s] 57%|████████████████████████████████████████████████████████████████                                                | 736/1288 [06:43<04:42,  1.96it/s] 57%|████████████████████████████████████████████████████████████████                                                | 737/1288 [06:44<04:47,  1.92it/s] 57%|████████████████████████████████████████████████████████████████▏                                               | 738/1288 [06:44<04:43,  1.94it/s] 57%|████████████████████████████████████████████████████████████████▎                                               | 739/1288 [06:45<04:46,  1.91it/s] 57%|████████████████████████████████████████████████████████████████▎                                               | 740/1288 [06:45<04:39,  1.96it/s]                                                                                                                                                        {'loss': 6.161, 'grad_norm': 2.96875, 'learning_rate': 0.04195020556092935, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3152.7, 'epoch': 2.29}
 57%|████████████████████████████████████████████████████████████████▎                                               | 740/1288 [06:45<04:39,  1.96it/s] 58%|████████████████████████████████████████████████████████████████▍                                               | 741/1288 [06:46<04:39,  1.96it/s] 58%|████████████████████████████████████████████████████████████████▌                                               | 742/1288 [06:46<04:44,  1.92it/s] 58%|████████████████████████████████████████████████████████████████▌                                               | 743/1288 [06:47<04:42,  1.93it/s] 58%|████████████████████████████████████████████████████████████████▋                                               | 744/1288 [06:47<04:46,  1.90it/s] 58%|████████████████████████████████████████████████████████████████▊                                               | 745/1288 [06:48<04:37,  1.96it/s] 58%|████████████████████████████████████████████████████████████████▊                                               | 746/1288 [06:48<04:31,  1.99it/s] 58%|████████████████████████████████████████████████████████████████▉                                               | 747/1288 [06:49<04:27,  2.02it/s] 58%|█████████████████████████████████████████████████████████████████                                               | 748/1288 [06:49<04:16,  2.11it/s] 58%|█████████████████████████████████████████████████████████████████▏                                              | 749/1288 [06:50<04:20,  2.07it/s] 58%|█████████████████████████████████████████████████████████████████▏                                              | 750/1288 [06:50<04:24,  2.04it/s]                                                                                                                                                        {'loss': 6.3675, 'grad_norm': 2.90625, 'learning_rate': 0.04068640659057242, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2631.57, 'epoch': 2.32}
 58%|█████████████████████████████████████████████████████████████████▏                                              | 750/1288 [06:50<04:24,  2.04it/s] 58%|█████████████████████████████████████████████████████████████████▎                                              | 751/1288 [06:51<04:33,  1.97it/s] 58%|█████████████████████████████████████████████████████████████████▍                                              | 752/1288 [06:51<04:27,  2.00it/s] 58%|█████████████████████████████████████████████████████████████████▍                                              | 753/1288 [06:52<04:33,  1.95it/s] 59%|█████████████████████████████████████████████████████████████████▌                                              | 754/1288 [06:52<04:33,  1.95it/s] 59%|█████████████████████████████████████████████████████████████████▋                                              | 755/1288 [06:53<04:28,  1.98it/s] 59%|█████████████████████████████████████████████████████████████████▋                                              | 756/1288 [06:53<04:30,  1.97it/s] 59%|█████████████████████████████████████████████████████████████████▊                                              | 757/1288 [06:54<04:29,  1.97it/s] 59%|█████████████████████████████████████████████████████████████████▉                                              | 758/1288 [06:54<04:25,  2.00it/s] 59%|██████████████████████████████████████████████████████████████████                                              | 759/1288 [06:55<04:18,  2.05it/s] 59%|██████████████████████████████████████████████████████████████████                                              | 760/1288 [06:55<04:14,  2.08it/s]                                                                                                                                                        {'loss': 6.4722, 'grad_norm': 3.46875, 'learning_rate': 0.039428742843067736, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2947.9, 'epoch': 2.35}
 59%|██████████████████████████████████████████████████████████████████                                              | 760/1288 [06:55<04:14,  2.08it/s] 59%|██████████████████████████████████████████████████████████████████▏                                             | 761/1288 [06:56<04:17,  2.04it/s] 59%|██████████████████████████████████████████████████████████████████▎                                             | 762/1288 [06:56<04:16,  2.05it/s] 59%|██████████████████████████████████████████████████████████████████▎                                             | 763/1288 [06:57<04:19,  2.02it/s] 59%|██████████████████████████████████████████████████████████████████▍                                             | 764/1288 [06:57<04:15,  2.05it/s] 59%|██████████████████████████████████████████████████████████████████▌                                             | 765/1288 [06:58<04:23,  1.98it/s] 59%|██████████████████████████████████████████████████████████████████▌                                             | 766/1288 [06:58<04:19,  2.01it/s] 60%|██████████████████████████████████████████████████████████████████▋                                             | 767/1288 [06:59<04:30,  1.93it/s] 60%|██████████████████████████████████████████████████████████████████▊                                             | 768/1288 [06:59<04:29,  1.93it/s] 60%|██████████████████████████████████████████████████████████████████▊                                             | 769/1288 [07:00<04:28,  1.93it/s] 60%|██████████████████████████████████████████████████████████████████▉                                             | 770/1288 [07:00<04:27,  1.94it/s]                                                                                                                                                        {'loss': 6.2898, 'grad_norm': 3.03125, 'learning_rate': 0.038178042789986356, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2814.33, 'epoch': 2.38}
 60%|██████████████████████████████████████████████████████████████████▉                                             | 770/1288 [07:00<04:27,  1.94it/s] 60%|███████████████████████████████████████████████████████████████████                                             | 771/1288 [07:01<04:27,  1.93it/s] 60%|███████████████████████████████████████████████████████████████████▏                                            | 772/1288 [07:01<04:32,  1.90it/s] 60%|███████████████████████████████████████████████████████████████████▏                                            | 773/1288 [07:02<04:29,  1.91it/s] 60%|███████████████████████████████████████████████████████████████████▎                                            | 774/1288 [07:02<04:27,  1.92it/s] 60%|███████████████████████████████████████████████████████████████████▍                                            | 775/1288 [07:03<04:26,  1.92it/s] 60%|███████████████████████████████████████████████████████████████████▍                                            | 776/1288 [07:03<04:24,  1.94it/s] 60%|███████████████████████████████████████████████████████████████████▌                                            | 777/1288 [07:04<03:55,  2.17it/s] 60%|███████████████████████████████████████████████████████████████████▋                                            | 778/1288 [07:04<04:07,  2.06it/s] 60%|███████████████████████████████████████████████████████████████████▋                                            | 779/1288 [07:05<04:05,  2.08it/s] 61%|███████████████████████████████████████████████████████████████████▊                                            | 780/1288 [07:05<04:05,  2.07it/s]                                                                                                                                                        {'loss': 6.6719, 'grad_norm': 3.671875, 'learning_rate': 0.036935130315645485, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2790.89, 'epoch': 2.41}
 61%|███████████████████████████████████████████████████████████████████▊                                            | 780/1288 [07:05<04:05,  2.07it/s] 61%|███████████████████████████████████████████████████████████████████▉                                            | 781/1288 [07:06<04:09,  2.03it/s] 61%|████████████████████████████████████████████████████████████████████                                            | 782/1288 [07:06<04:12,  2.00it/s] 61%|████████████████████████████████████████████████████████████████████                                            | 783/1288 [07:07<04:19,  1.95it/s] 61%|████████████████████████████████████████████████████████████████████▏                                           | 784/1288 [07:07<04:17,  1.96it/s] 61%|████████████████████████████████████████████████████████████████████▎                                           | 785/1288 [07:08<04:22,  1.92it/s] 61%|████████████████████████████████████████████████████████████████████▎                                           | 786/1288 [07:08<04:14,  1.97it/s] 61%|████████████████████████████████████████████████████████████████████▍                                           | 787/1288 [07:09<04:09,  2.00it/s] 61%|████████████████████████████████████████████████████████████████████▌                                           | 788/1288 [07:09<04:09,  2.00it/s] 61%|████████████████████████████████████████████████████████████████████▌                                           | 789/1288 [07:10<04:15,  1.95it/s] 61%|████████████████████████████████████████████████████████████████████▋                                           | 790/1288 [07:10<04:32,  1.83it/s]                                                                                                                                                        {'loss': 6.1644, 'grad_norm': 3.015625, 'learning_rate': 0.035700824174384194, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2909.14, 'epoch': 2.45}
 61%|████████████████████████████████████████████████████████████████████▋                                           | 790/1288 [07:10<04:32,  1.83it/s] 61%|████████████████████████████████████████████████████████████████████▊                                           | 791/1288 [07:11<04:30,  1.83it/s] 61%|████████████████████████████████████████████████████████████████████▊                                           | 792/1288 [07:11<04:29,  1.84it/s] 62%|████████████████████████████████████████████████████████████████████▉                                           | 793/1288 [07:12<04:23,  1.88it/s] 62%|█████████████████████████████████████████████████████████████████████                                           | 794/1288 [07:12<04:13,  1.95it/s] 62%|█████████████████████████████████████████████████████████████████████▏                                          | 795/1288 [07:13<04:17,  1.92it/s] 62%|█████████████████████████████████████████████████████████████████████▏                                          | 796/1288 [07:13<04:10,  1.97it/s] 62%|█████████████████████████████████████████████████████████████████████▎                                          | 797/1288 [07:14<04:08,  1.98it/s] 62%|█████████████████████████████████████████████████████████████████████▍                                          | 798/1288 [07:14<04:12,  1.94it/s] 62%|█████████████████████████████████████████████████████████████████████▍                                          | 799/1288 [07:15<04:12,  1.93it/s] 62%|█████████████████████████████████████████████████████████████████████▌                                          | 800/1288 [07:15<04:07,  1.97it/s]                                                                                                                                                        {'loss': 6.1932, 'grad_norm': 3.375, 'learning_rate': 0.034475937451218255, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2991.93, 'epoch': 2.48}
 62%|█████████████████████████████████████████████████████████████████████▌                                          | 800/1288 [07:15<04:07,  1.97it/s][2025-12-13 20:46:29,727] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-800
 62%|█████████████████████████████████████████████████████████████████████▋                                          | 801/1288 [07:19<10:49,  1.33s/it] 62%|█████████████████████████████████████████████████████████████████████▋                                          | 802/1288 [07:19<08:48,  1.09s/it] 62%|█████████████████████████████████████████████████████████████████████▊                                          | 803/1288 [07:20<07:17,  1.11it/s] 62%|█████████████████████████████████████████████████████████████████████▉                                          | 804/1288 [07:20<06:18,  1.28it/s] 62%|██████████████████████████████████████████████████████████████████████                                          | 805/1288 [07:21<05:38,  1.43it/s] 63%|██████████████████████████████████████████████████████████████████████                                          | 806/1288 [07:21<05:06,  1.57it/s] 63%|██████████████████████████████████████████████████████████████████████▏                                         | 807/1288 [07:22<04:41,  1.71it/s] 63%|██████████████████████████████████████████████████████████████████████▎                                         | 808/1288 [07:22<04:30,  1.77it/s] 63%|██████████████████████████████████████████████████████████████████████▎                                         | 809/1288 [07:23<04:15,  1.87it/s] 63%|██████████████████████████████████████████████████████████████████████▍                                         | 810/1288 [07:23<04:16,  1.86it/s]                                                                                                                                                        {'loss': 6.34, 'grad_norm': 2.8125, 'learning_rate': 0.03326127702622985, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2459.23, 'epoch': 2.51}
 63%|██████████████████████████████████████████████████████████████████████▍                                         | 810/1288 [07:23<04:16,  1.86it/s] 63%|██████████████████████████████████████████████████████████████████████▌                                         | 811/1288 [07:24<04:20,  1.83it/s] 63%|██████████████████████████████████████████████████████████████████████▌                                         | 812/1288 [07:24<04:10,  1.90it/s] 63%|██████████████████████████████████████████████████████████████████████▋                                         | 813/1288 [07:25<04:00,  1.97it/s] 63%|██████████████████████████████████████████████████████████████████████▊                                         | 814/1288 [07:25<04:06,  1.92it/s] 63%|██████████████████████████████████████████████████████████████████████▊                                         | 815/1288 [07:26<04:04,  1.93it/s] 63%|██████████████████████████████████████████████████████████████████████▉                                         | 816/1288 [07:26<03:53,  2.02it/s] 63%|███████████████████████████████████████████████████████████████████████                                         | 817/1288 [07:27<03:59,  1.97it/s] 64%|███████████████████████████████████████████████████████████████████████▏                                        | 818/1288 [07:27<03:44,  2.09it/s] 64%|███████████████████████████████████████████████████████████████████████▏                                        | 819/1288 [07:28<03:45,  2.08it/s] 64%|███████████████████████████████████████████████████████████████████████▎                                        | 820/1288 [07:28<03:49,  2.04it/s]                                                                                                                                                        {'loss': 6.2486, 'grad_norm': 3.46875, 'learning_rate': 0.03205764304304445, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2574.92, 'epoch': 2.54}
 64%|███████████████████████████████████████████████████████████████████████▎                                        | 820/1288 [07:28<03:49,  2.04it/s] 64%|███████████████████████████████████████████████████████████████████████▍                                        | 821/1288 [07:29<03:52,  2.01it/s] 64%|███████████████████████████████████████████████████████████████████████▍                                        | 822/1288 [07:29<03:57,  1.96it/s] 64%|███████████████████████████████████████████████████████████████████████▌                                        | 823/1288 [07:30<04:02,  1.92it/s] 64%|███████████████████████████████████████████████████████████████████████▋                                        | 824/1288 [07:30<04:15,  1.82it/s] 64%|███████████████████████████████████████████████████████████████████████▋                                        | 825/1288 [07:31<04:04,  1.90it/s] 64%|███████████████████████████████████████████████████████████████████████▊                                        | 826/1288 [07:31<04:04,  1.89it/s] 64%|███████████████████████████████████████████████████████████████████████▉                                        | 827/1288 [07:32<04:01,  1.91it/s] 64%|████████████████████████████████████████████████████████████████████████                                        | 828/1288 [07:32<03:58,  1.93it/s] 64%|████████████████████████████████████████████████████████████████████████                                        | 829/1288 [07:33<03:51,  1.98it/s] 64%|████████████████████████████████████████████████████████████████████████▏                                       | 830/1288 [07:33<03:56,  1.94it/s]                                                                                                                                                        {'loss': 6.1866, 'grad_norm': 3.390625, 'learning_rate': 0.030865828381745515, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2725.02, 'epoch': 2.57}
 64%|████████████████████████████████████████████████████████████████████████▏                                       | 830/1288 [07:33<03:56,  1.94it/s] 65%|████████████████████████████████████████████████████████████████████████▎                                       | 831/1288 [07:34<03:56,  1.93it/s] 65%|████████████████████████████████████████████████████████████████████████▎                                       | 832/1288 [07:34<03:54,  1.94it/s] 65%|████████████████████████████████████████████████████████████████████████▍                                       | 833/1288 [07:35<03:52,  1.95it/s] 65%|████████████████████████████████████████████████████████████████████████▌                                       | 834/1288 [07:35<03:51,  1.96it/s] 65%|████████████████████████████████████████████████████████████████████████▌                                       | 835/1288 [07:36<03:50,  1.96it/s] 65%|████████████████████████████████████████████████████████████████████████▋                                       | 836/1288 [07:37<03:55,  1.92it/s] 65%|████████████████████████████████████████████████████████████████████████▊                                       | 837/1288 [07:37<03:44,  2.01it/s] 65%|████████████████████████████████████████████████████████████████████████▊                                       | 838/1288 [07:38<03:46,  1.98it/s] 65%|████████████████████████████████████████████████████████████████████████▉                                       | 839/1288 [07:38<03:42,  2.02it/s] 65%|█████████████████████████████████████████████████████████████████████████                                       | 840/1288 [07:38<03:43,  2.00it/s]                                                                                                                                                        {'loss': 6.3355, 'grad_norm': 2.984375, 'learning_rate': 0.029686618136573695, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2675.18, 'epoch': 2.6}
 65%|█████████████████████████████████████████████████████████████████████████                                       | 840/1288 [07:38<03:43,  2.00it/s] 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 841/1288 [07:39<03:48,  1.95it/s] 65%|█████████████████████████████████████████████████████████████████████████▏                                      | 842/1288 [07:39<03:42,  2.00it/s] 65%|█████████████████████████████████████████████████████████████████████████▎                                      | 843/1288 [07:40<03:43,  1.99it/s] 66%|█████████████████████████████████████████████████████████████████████████▍                                      | 844/1288 [07:41<03:42,  1.99it/s] 66%|█████████████████████████████████████████████████████████████████████████▍                                      | 845/1288 [07:41<03:42,  1.99it/s] 66%|█████████████████████████████████████████████████████████████████████████▌                                      | 846/1288 [07:41<03:38,  2.03it/s] 66%|█████████████████████████████████████████████████████████████████████████▋                                      | 847/1288 [07:42<03:15,  2.26it/s] 66%|█████████████████████████████████████████████████████████████████████████▋                                      | 848/1288 [07:42<03:28,  2.11it/s] 66%|█████████████████████████████████████████████████████████████████████████▊                                      | 849/1288 [07:43<03:30,  2.09it/s] 66%|█████████████████████████████████████████████████████████████████████████▉                                      | 850/1288 [07:43<03:33,  2.05it/s]                                                                                                                                                        {'loss': 6.4024, 'grad_norm': 2.796875, 'learning_rate': 0.02852078909875505, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2364.07, 'epoch': 2.63}
 66%|█████████████████████████████████████████████████████████████████████████▉                                      | 850/1288 [07:43<03:33,  2.05it/s] 66%|██████████████████████████████████████████████████████████████████████████                                      | 851/1288 [07:44<03:34,  2.04it/s] 66%|██████████████████████████████████████████████████████████████████████████                                      | 852/1288 [07:44<03:31,  2.06it/s] 66%|██████████████████████████████████████████████████████████████████████████▏                                     | 853/1288 [07:45<03:35,  2.02it/s] 66%|██████████████████████████████████████████████████████████████████████████▎                                     | 854/1288 [07:45<03:28,  2.08it/s] 66%|██████████████████████████████████████████████████████████████████████████▎                                     | 855/1288 [07:46<03:31,  2.04it/s] 66%|██████████████████████████████████████████████████████████████████████████▍                                     | 856/1288 [07:46<03:38,  1.98it/s] 67%|██████████████████████████████████████████████████████████████████████████▌                                     | 857/1288 [07:47<03:35,  2.00it/s] 67%|██████████████████████████████████████████████████████████████████████████▌                                     | 858/1288 [07:47<03:40,  1.95it/s] 67%|██████████████████████████████████████████████████████████████████████████▋                                     | 859/1288 [07:48<03:33,  2.01it/s] 67%|██████████████████████████████████████████████████████████████████████████▊                                     | 860/1288 [07:48<03:37,  1.97it/s]                                                                                                                                                        {'loss': 6.3874, 'grad_norm': 4.5625, 'learning_rate': 0.02736910924479881, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2543.21, 'epoch': 2.66}
 67%|██████████████████████████████████████████████████████████████████████████▊                                     | 860/1288 [07:48<03:37,  1.97it/s] 67%|██████████████████████████████████████████████████████████████████████████▊                                     | 861/1288 [07:49<03:33,  2.00it/s] 67%|██████████████████████████████████████████████████████████████████████████▉                                     | 862/1288 [07:49<03:39,  1.94it/s] 67%|███████████████████████████████████████████████████████████████████████████                                     | 863/1288 [07:50<03:36,  1.96it/s] 67%|███████████████████████████████████████████████████████████████████████████▏                                    | 864/1288 [07:50<03:35,  1.96it/s] 67%|███████████████████████████████████████████████████████████████████████████▏                                    | 865/1288 [07:51<03:40,  1.91it/s] 67%|███████████████████████████████████████████████████████████████████████████▎                                    | 866/1288 [07:51<03:42,  1.89it/s] 67%|███████████████████████████████████████████████████████████████████████████▍                                    | 867/1288 [07:52<03:44,  1.87it/s] 67%|███████████████████████████████████████████████████████████████████████████▍                                    | 868/1288 [07:53<03:36,  1.94it/s] 67%|███████████████████████████████████████████████████████████████████████████▌                                    | 869/1288 [07:53<03:37,  1.92it/s] 68%|███████████████████████████████████████████████████████████████████████████▋                                    | 870/1288 [07:54<03:36,  1.94it/s]                                                                                                                                                        {'loss': 6.42, 'grad_norm': 2.828125, 'learning_rate': 0.026232337230601567, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2932.8, 'epoch': 2.69}
 68%|███████████████████████████████████████████████████████████████████████████▋                                    | 870/1288 [07:54<03:36,  1.94it/s] 68%|███████████████████████████████████████████████████████████████████████████▋                                    | 871/1288 [07:54<03:34,  1.95it/s] 68%|███████████████████████████████████████████████████████████████████████████▊                                    | 872/1288 [07:55<03:36,  1.92it/s] 68%|███████████████████████████████████████████████████████████████████████████▉                                    | 873/1288 [07:55<03:39,  1.89it/s] 68%|████████████████████████████████████████████████████████████████████████████                                    | 874/1288 [07:56<03:37,  1.90it/s] 68%|████████████████████████████████████████████████████████████████████████████                                    | 875/1288 [07:56<03:26,  2.00it/s] 68%|████████████████████████████████████████████████████████████████████████████▏                                   | 876/1288 [07:57<03:28,  1.98it/s] 68%|████████████████████████████████████████████████████████████████████████████▎                                   | 877/1288 [07:57<03:26,  1.99it/s] 68%|████████████████████████████████████████████████████████████████████████████▎                                   | 878/1288 [07:58<03:32,  1.93it/s] 68%|████████████████████████████████████████████████████████████████████████████▍                                   | 879/1288 [07:58<03:27,  1.97it/s] 68%|████████████████████████████████████████████████████████████████████████████▌                                   | 880/1288 [07:59<03:26,  1.98it/s]                                                                                                                                                        {'loss': 6.2999, 'grad_norm': 3.5, 'learning_rate': 0.025111221891691383, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2832.37, 'epoch': 2.72}
 68%|████████████████████████████████████████████████████████████████████████████▌                                   | 880/1288 [07:59<03:26,  1.98it/s] 68%|████████████████████████████████████████████████████████████████████████████▌                                   | 881/1288 [07:59<03:25,  1.98it/s] 68%|████████████████████████████████████████████████████████████████████████████▋                                   | 882/1288 [08:00<03:21,  2.01it/s] 69%|████████████████████████████████████████████████████████████████████████████▊                                   | 883/1288 [08:00<03:15,  2.07it/s] 69%|████████████████████████████████████████████████████████████████████████████▊                                   | 884/1288 [08:00<03:03,  2.20it/s] 69%|████████████████████████████████████████████████████████████████████████████▉                                   | 885/1288 [08:01<03:13,  2.08it/s] 69%|█████████████████████████████████████████████████████████████████████████████                                   | 886/1288 [08:01<03:10,  2.11it/s] 69%|█████████████████████████████████████████████████████████████████████████████▏                                  | 887/1288 [08:02<03:10,  2.10it/s] 69%|█████████████████████████████████████████████████████████████████████████████▏                                  | 888/1288 [08:02<03:14,  2.06it/s] 69%|█████████████████████████████████████████████████████████████████████████████▎                                  | 889/1288 [08:03<03:21,  1.98it/s] 69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 890/1288 [08:04<03:24,  1.94it/s]                                                                                                                                                        {'loss': 6.3203, 'grad_norm': 3.140625, 'learning_rate': 0.024006501749941095, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2402.44, 'epoch': 2.76}
 69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 890/1288 [08:04<03:24,  1.94it/s] 69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 891/1288 [08:04<03:24,  1.95it/s] 69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 892/1288 [08:05<03:19,  1.99it/s] 69%|█████████████████████████████████████████████████████████████████████████████▋                                  | 893/1288 [08:05<03:14,  2.03it/s] 69%|█████████████████████████████████████████████████████████████████████████████▋                                  | 894/1288 [08:06<03:20,  1.97it/s] 69%|█████████████████████████████████████████████████████████████████████████████▊                                  | 895/1288 [08:06<03:19,  1.97it/s] 70%|█████████████████████████████████████████████████████████████████████████████▉                                  | 896/1288 [08:07<03:24,  1.92it/s] 70%|██████████████████████████████████████████████████████████████████████████████                                  | 897/1288 [08:07<03:18,  1.97it/s] 70%|██████████████████████████████████████████████████████████████████████████████                                  | 898/1288 [08:08<03:22,  1.92it/s] 70%|██████████████████████████████████████████████████████████████████████████████▏                                 | 899/1288 [08:08<03:21,  1.93it/s] 70%|██████████████████████████████████████████████████████████████████████████████▎                                 | 900/1288 [08:09<03:16,  1.97it/s]                                                                                                                                                        {'loss': 6.3982, 'grad_norm': 3.15625, 'learning_rate': 0.02291890452707539, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3002.25, 'epoch': 2.79}
 70%|██████████████████████████████████████████████████████████████████████████████▎                                 | 900/1288 [08:09<03:16,  1.97it/s][2025-12-13 20:47:22,867] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-900
 70%|██████████████████████████████████████████████████████████████████████████████▎                                 | 901/1288 [08:12<08:08,  1.26s/it] 70%|██████████████████████████████████████████████████████████████████████████████▍                                 | 902/1288 [08:12<06:35,  1.02s/it] 70%|██████████████████████████████████████████████████████████████████████████████▌                                 | 903/1288 [08:13<05:31,  1.16it/s] 70%|██████████████████████████████████████████████████████████████████████████████▌                                 | 904/1288 [08:13<04:53,  1.31it/s] 70%|██████████████████████████████████████████████████████████████████████████████▋                                 | 905/1288 [08:14<04:24,  1.45it/s] 70%|██████████████████████████████████████████████████████████████████████████████▊                                 | 906/1288 [08:14<04:06,  1.55it/s] 70%|██████████████████████████████████████████████████████████████████████████████▊                                 | 907/1288 [08:15<03:50,  1.65it/s] 70%|██████████████████████████████████████████████████████████████████████████████▉                                 | 908/1288 [08:15<03:42,  1.71it/s] 71%|███████████████████████████████████████████████████████████████████████████████                                 | 909/1288 [08:16<03:29,  1.81it/s] 71%|███████████████████████████████████████████████████████████████████████████████▏                                | 910/1288 [08:16<03:23,  1.86it/s]                                                                                                                                                        {'loss': 6.3465, 'grad_norm': 3.46875, 'learning_rate': 0.021849146665292515, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2833.93, 'epoch': 2.82}
 71%|███████████████████████████████████████████████████████████████████████████████▏                                | 910/1288 [08:16<03:23,  1.86it/s] 71%|███████████████████████████████████████████████████████████████████████████████▏                                | 911/1288 [08:17<03:26,  1.83it/s] 71%|███████████████████████████████████████████████████████████████████████████████▎                                | 912/1288 [08:17<03:24,  1.84it/s] 71%|███████████████████████████████████████████████████████████████████████████████▍                                | 913/1288 [08:18<03:21,  1.86it/s] 71%|███████████████████████████████████████████████████████████████████████████████▍                                | 914/1288 [08:18<03:22,  1.85it/s] 71%|███████████████████████████████████████████████████████████████████████████████▌                                | 915/1288 [08:19<03:18,  1.88it/s] 71%|███████████████████████████████████████████████████████████████████████████████▋                                | 916/1288 [08:19<03:16,  1.90it/s] 71%|███████████████████████████████████████████████████████████████████████████████▋                                | 917/1288 [08:20<03:13,  1.92it/s] 71%|███████████████████████████████████████████████████████████████████████████████▊                                | 918/1288 [08:20<03:07,  1.98it/s] 71%|███████████████████████████████████████████████████████████████████████████████▉                                | 919/1288 [08:21<03:06,  1.98it/s] 71%|████████████████████████████████████████████████████████████████████████████████                                | 920/1288 [08:21<03:01,  2.02it/s]                                                                                                                                                        {'loss': 6.1632, 'grad_norm': 3.015625, 'learning_rate': 0.020797932855316184, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3115.38, 'epoch': 2.85}
 71%|████████████████████████████████████████████████████████████████████████████████                                | 920/1288 [08:21<03:01,  2.02it/s] 72%|████████████████████████████████████████████████████████████████████████████████                                | 921/1288 [08:22<03:04,  1.99it/s] 72%|████████████████████████████████████████████████████████████████████████████████▏                               | 922/1288 [08:22<03:01,  2.02it/s] 72%|████████████████████████████████████████████████████████████████████████████████▎                               | 923/1288 [08:23<03:08,  1.93it/s] 72%|████████████████████████████████████████████████████████████████████████████████▎                               | 924/1288 [08:23<03:04,  1.98it/s] 72%|████████████████████████████████████████████████████████████████████████████████▍                               | 925/1288 [08:24<03:07,  1.93it/s] 72%|████████████████████████████████████████████████████████████████████████████████▌                               | 926/1288 [08:25<03:07,  1.93it/s] 72%|████████████████████████████████████████████████████████████████████████████████▌                               | 927/1288 [08:25<03:09,  1.91it/s] 72%|████████████████████████████████████████████████████████████████████████████████▋                               | 928/1288 [08:26<03:07,  1.92it/s] 72%|████████████████████████████████████████████████████████████████████████████████▊                               | 929/1288 [08:26<03:09,  1.89it/s] 72%|████████████████████████████████████████████████████████████████████████████████▊                               | 930/1288 [08:27<03:10,  1.88it/s]                                                                                                                                                        {'loss': 6.0676, 'grad_norm': 3.015625, 'learning_rate': 0.019765955572188576, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2862.8, 'epoch': 2.88}
 72%|████████████████████████████████████████████████████████████████████████████████▊                               | 930/1288 [08:27<03:10,  1.88it/s] 72%|████████████████████████████████████████████████████████████████████████████████▉                               | 931/1288 [08:27<03:11,  1.87it/s] 72%|█████████████████████████████████████████████████████████████████████████████████                               | 932/1288 [08:28<03:08,  1.89it/s] 72%|█████████████████████████████████████████████████████████████████████████████████▏                              | 933/1288 [08:28<03:02,  1.94it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▏                              | 934/1288 [08:29<02:58,  1.99it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▎                              | 935/1288 [08:29<02:55,  2.01it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▍                              | 936/1288 [08:30<02:53,  2.03it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▍                              | 937/1288 [08:30<02:55,  2.00it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▌                              | 938/1288 [08:31<02:52,  2.03it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▋                              | 939/1288 [08:31<02:48,  2.07it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▋                              | 940/1288 [08:32<02:46,  2.09it/s]                                                                                                                                                        {'loss': 6.4408, 'grad_norm': 3.046875, 'learning_rate': 0.018753894619110548, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2776.57, 'epoch': 2.91}
 73%|█████████████████████████████████████████████████████████████████████████████████▋                              | 940/1288 [08:32<02:46,  2.09it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▊                              | 941/1288 [08:32<02:51,  2.02it/s] 73%|█████████████████████████████████████████████████████████████████████████████████▉                              | 942/1288 [08:33<02:56,  1.97it/s] 73%|██████████████████████████████████████████████████████████████████████████████████                              | 943/1288 [08:33<02:52,  2.00it/s] 73%|██████████████████████████████████████████████████████████████████████████████████                              | 944/1288 [08:34<02:53,  1.98it/s] 73%|██████████████████████████████████████████████████████████████████████████████████▏                             | 945/1288 [08:34<02:55,  1.95it/s] 73%|██████████████████████████████████████████████████████████████████████████████████▎                             | 946/1288 [08:35<02:55,  1.95it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▎                             | 947/1288 [08:35<02:55,  1.94it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▍                             | 948/1288 [08:36<02:54,  1.95it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▌                             | 949/1288 [08:36<02:44,  2.06it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▌                             | 950/1288 [08:37<02:50,  1.98it/s]                                                                                                                                                        {'loss': 6.1035, 'grad_norm': 3.484375, 'learning_rate': 0.01776241667962879, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2635.97, 'epoch': 2.94}
 74%|██████████████████████████████████████████████████████████████████████████████████▌                             | 950/1288 [08:37<02:50,  1.98it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▋                             | 951/1288 [08:37<02:46,  2.02it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▊                             | 952/1288 [08:38<02:41,  2.08it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▊                             | 953/1288 [08:38<02:43,  2.06it/s] 74%|██████████████████████████████████████████████████████████████████████████████████▉                             | 954/1288 [08:39<02:40,  2.07it/s] 74%|███████████████████████████████████████████████████████████████████████████████████                             | 955/1288 [08:39<02:40,  2.08it/s] 74%|███████████████████████████████████████████████████████████████████████████████████▏                            | 956/1288 [08:39<02:35,  2.14it/s] 74%|███████████████████████████████████████████████████████████████████████████████████▏                            | 957/1288 [08:40<02:35,  2.13it/s] 74%|███████████████████████████████████████████████████████████████████████████████████▎                            | 958/1288 [08:40<02:41,  2.04it/s] 74%|███████████████████████████████████████████████████████████████████████████████████▍                            | 959/1288 [08:41<02:46,  1.97it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▍                            | 960/1288 [08:42<02:48,  1.94it/s]                                                                                                                                                        {'loss': 5.8823, 'grad_norm': 3.5625, 'learning_rate': 0.016792174878465932, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2494.38, 'epoch': 2.97}
 75%|███████████████████████████████████████████████████████████████████████████████████▍                            | 960/1288 [08:42<02:48,  1.94it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▌                            | 961/1288 [08:42<02:51,  1.91it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▋                            | 962/1288 [08:43<02:44,  1.98it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▋                            | 963/1288 [08:43<02:30,  2.16it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▊                            | 964/1288 [08:43<02:35,  2.08it/s] 75%|███████████████████████████████████████████████████████████████████████████████████▉                            | 965/1288 [08:44<02:37,  2.05it/s] 75%|████████████████████████████████████████████████████████████████████████████████████                            | 966/1288 [08:44<02:36,  2.06it/s] 75%|████████████████████████████████████████████████████████████████████████████████████                            | 967/1288 [08:45<02:31,  2.12it/s] 75%|████████████████████████████████████████████████████████████████████████████████████▏                           | 968/1288 [08:45<02:37,  2.03it/s] 75%|████████████████████████████████████████████████████████████████████████████████████▎                           | 969/1288 [08:46<02:30,  2.12it/s] 75%|████████████████████████████████████████████████████████████████████████████████████▎                           | 970/1288 [08:47<04:15,  1.25it/s]                                                                                                                                                        {'loss': 6.4021, 'grad_norm': 3.4375, 'learning_rate': 0.015843808351281914, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2460.65, 'epoch': 3.0}
 75%|████████████████████████████████████████████████████████████████████████████████████▎                           | 970/1288 [08:47<04:15,  1.25it/s] 75%|████████████████████████████████████████████████████████████████████████████████████▍                           | 971/1288 [08:48<03:43,  1.42it/s] 75%|████████████████████████████████████████████████████████████████████████████████████▌                           | 972/1288 [08:48<03:23,  1.55it/s] 76%|████████████████████████████████████████████████████████████████████████████████████▌                           | 973/1288 [08:49<03:06,  1.69it/s] 76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 974/1288 [08:49<02:54,  1.80it/s] 76%|████████████████████████████████████████████████████████████████████████████████████▊                           | 975/1288 [08:50<02:45,  1.89it/s] 76%|████████████████████████████████████████████████████████████████████████████████████▊                           | 976/1288 [08:50<02:46,  1.88it/s] 76%|████████████████████████████████████████████████████████████████████████████████████▉                           | 977/1288 [08:51<02:43,  1.91it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████                           | 978/1288 [08:51<02:37,  1.97it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▏                          | 979/1288 [08:52<02:37,  1.96it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▏                          | 980/1288 [08:52<02:37,  1.96it/s]                                                                                                                                                        {'loss': 6.2481, 'grad_norm': 3.421875, 'learning_rate': 0.014917941823650915, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2553.72, 'epoch': 3.03}
 76%|█████████████████████████████████████████████████████████████████████████████████████▏                          | 980/1288 [08:52<02:37,  1.96it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▎                          | 981/1288 [08:53<02:36,  1.96it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▍                          | 982/1288 [08:53<02:32,  2.01it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▍                          | 983/1288 [08:54<02:32,  2.00it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▌                          | 984/1288 [08:54<02:32,  1.99it/s] 76%|█████████████████████████████████████████████████████████████████████████████████████▋                          | 985/1288 [08:55<02:30,  2.02it/s] 77%|█████████████████████████████████████████████████████████████████████████████████████▋                          | 986/1288 [08:55<02:32,  1.98it/s] 77%|█████████████████████████████████████████████████████████████████████████████████████▊                          | 987/1288 [08:56<02:31,  1.98it/s] 77%|█████████████████████████████████████████████████████████████████████████████████████▉                          | 988/1288 [08:56<02:24,  2.07it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████                          | 989/1288 [08:57<02:29,  2.00it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████                          | 990/1288 [08:57<02:29,  1.99it/s]                                                                                                                                                        {'loss': 6.4088, 'grad_norm': 3.609375, 'learning_rate': 0.014015185199530378, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2636.23, 'epoch': 3.07}
 77%|██████████████████████████████████████████████████████████████████████████████████████                          | 990/1288 [08:57<02:29,  1.99it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▏                         | 991/1288 [08:58<02:32,  1.94it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▎                         | 992/1288 [08:58<02:34,  1.92it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▎                         | 993/1288 [08:59<02:27,  2.01it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▍                         | 994/1288 [08:59<02:24,  2.04it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▌                         | 995/1288 [09:00<02:27,  1.99it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▌                         | 996/1288 [09:00<02:26,  1.99it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▋                         | 997/1288 [09:01<02:26,  1.98it/s] 77%|██████████████████████████████████████████████████████████████████████████████████████▊                         | 998/1288 [09:01<02:24,  2.01it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▊                         | 999/1288 [09:02<02:24,  2.00it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▏                        | 1000/1288 [09:02<02:25,  1.98it/s]                                                                                                                                                        {'loss': 6.2442, 'grad_norm': 3.46875, 'learning_rate': 0.013136133159493801, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2650.12, 'epoch': 3.1}
 78%|██████████████████████████████████████████████████████████████████████████████████████▏                        | 1000/1288 [09:02<02:25,  1.98it/s][2025-12-13 20:48:16,595] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-1000
 78%|██████████████████████████████████████████████████████████████████████████████████████▎                        | 1001/1288 [09:05<06:07,  1.28s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████▎                        | 1002/1288 [09:06<04:56,  1.04s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████▍                        | 1003/1288 [09:06<04:07,  1.15it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▌                        | 1004/1288 [09:07<03:32,  1.34it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▌                        | 1005/1288 [09:07<03:13,  1.46it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▋                        | 1006/1288 [09:08<03:00,  1.56it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▊                        | 1007/1288 [09:08<02:48,  1.67it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▊                        | 1008/1288 [09:09<02:37,  1.78it/s] 78%|██████████████████████████████████████████████████████████████████████████████████████▉                        | 1009/1288 [09:09<02:35,  1.79it/s] 78%|███████████████████████████████████████████████████████████████████████████████████████                        | 1010/1288 [09:10<02:33,  1.81it/s]                                                                                                                                                        {'loss': 6.2945, 'grad_norm': 3.65625, 'learning_rate': 0.012281364768991804, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2553.0, 'epoch': 3.13}
 78%|███████████████████████████████████████████████████████████████████████████████████████                        | 1010/1288 [09:10<02:33,  1.81it/s] 78%|███████████████████████████████████████████████████████████████████████████████████████▏                       | 1011/1288 [09:11<02:33,  1.81it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▏                       | 1012/1288 [09:11<02:29,  1.85it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▎                       | 1013/1288 [09:12<02:26,  1.88it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▍                       | 1014/1288 [09:12<02:21,  1.94it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▍                       | 1015/1288 [09:13<02:23,  1.91it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▌                       | 1016/1288 [09:13<02:21,  1.93it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▋                       | 1017/1288 [09:14<02:16,  1.98it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▋                       | 1018/1288 [09:14<02:16,  1.98it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▊                       | 1019/1288 [09:15<02:13,  2.01it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▉                       | 1020/1288 [09:15<02:13,  2.01it/s]                                                                                                                                                        {'loss': 6.4189, 'grad_norm': 3.140625, 'learning_rate': 0.01145144309689934, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2812.43, 'epoch': 3.16}
 79%|███████████████████████████████████████████████████████████████████████████████████████▉                       | 1020/1288 [09:15<02:13,  2.01it/s] 79%|███████████████████████████████████████████████████████████████████████████████████████▉                       | 1021/1288 [09:16<02:15,  1.96it/s] 79%|████████████████████████████████████████████████████████████████████████████████████████                       | 1022/1288 [09:16<02:15,  1.96it/s] 79%|████████████████████████████████████████████████████████████████████████████████████████▏                      | 1023/1288 [09:17<02:11,  2.01it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▏                      | 1024/1288 [09:17<02:14,  1.96it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▎                      | 1025/1288 [09:18<02:14,  1.95it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▍                      | 1026/1288 [09:18<02:13,  1.96it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▌                      | 1027/1288 [09:19<02:10,  2.00it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▌                      | 1028/1288 [09:19<02:13,  1.95it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▋                      | 1029/1288 [09:20<02:10,  1.99it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▊                      | 1030/1288 [09:20<02:04,  2.08it/s]                                                                                                                                                        {'loss': 6.2488, 'grad_norm': 3.28125, 'learning_rate': 0.010646914844600543, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3256.86, 'epoch': 3.19}
 80%|████████████████████████████████████████████████████████████████████████████████████████▊                      | 1030/1288 [09:20<02:04,  2.08it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▊                      | 1031/1288 [09:21<02:04,  2.06it/s] 80%|████████████████████████████████████████████████████████████████████████████████████████▉                      | 1032/1288 [09:21<02:03,  2.07it/s] 80%|█████████████████████████████████████████████████████████████████████████████████████████                      | 1033/1288 [09:22<02:05,  2.03it/s] 80%|█████████████████████████████████████████████████████████████████████████████████████████                      | 1034/1288 [09:22<02:09,  1.97it/s] 80%|█████████████████████████████████████████████████████████████████████████████████████████▏                     | 1035/1288 [09:23<02:07,  1.98it/s] 80%|█████████████████████████████████████████████████████████████████████████████████████████▎                     | 1036/1288 [09:23<02:07,  1.98it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▎                     | 1037/1288 [09:24<02:04,  2.01it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▍                     | 1038/1288 [09:24<02:05,  1.99it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▌                     | 1039/1288 [09:25<02:03,  2.02it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▋                     | 1040/1288 [09:25<02:02,  2.02it/s]                                                                                                                                                        {'loss': 6.1305, 'grad_norm': 5.375, 'learning_rate': 0.009868309985855445, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2776.15, 'epoch': 3.22}
 81%|█████████████████████████████████████████████████████████████████████████████████████████▋                     | 1040/1288 [09:25<02:02,  2.02it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▋                     | 1041/1288 [09:26<02:02,  2.01it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▊                     | 1042/1288 [09:26<02:01,  2.03it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▉                     | 1043/1288 [09:27<02:01,  2.02it/s] 81%|█████████████████████████████████████████████████████████████████████████████████████████▉                     | 1044/1288 [09:27<02:02,  2.00it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████                     | 1045/1288 [09:28<02:02,  1.98it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 1046/1288 [09:28<02:02,  1.98it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 1047/1288 [09:29<02:01,  1.98it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████▎                    | 1048/1288 [09:29<02:03,  1.94it/s] 81%|██████████████████████████████████████████████████████████████████████████████████████████▍                    | 1049/1288 [09:30<02:03,  1.94it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▍                    | 1050/1288 [09:30<02:01,  1.95it/s]                                                                                                                                                        {'loss': 6.547, 'grad_norm': 3.203125, 'learning_rate': 0.009116141417685898, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2870.03, 'epoch': 3.25}
 82%|██████████████████████████████████████████████████████████████████████████████████████████▍                    | 1050/1288 [09:30<02:01,  1.95it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▌                    | 1051/1288 [09:31<02:00,  1.96it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▋                    | 1052/1288 [09:31<01:58,  2.00it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▋                    | 1053/1288 [09:32<01:55,  2.03it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▊                    | 1054/1288 [09:32<01:56,  2.00it/s] 82%|██████████████████████████████████████████████████████████████████████████████████████████▉                    | 1055/1288 [09:33<01:54,  2.04it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████                    | 1056/1288 [09:33<01:55,  2.01it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████                    | 1057/1288 [09:34<01:57,  1.96it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████▏                   | 1058/1288 [09:34<01:55,  2.00it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████▎                   | 1059/1288 [09:35<01:57,  1.95it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████▎                   | 1060/1288 [09:35<01:56,  1.96it/s]                                                                                                                                                        {'loss': 6.2881, 'grad_norm': 3.515625, 'learning_rate': 0.00839090462251047, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2688.05, 'epoch': 3.28}
 82%|███████████████████████████████████████████████████████████████████████████████████████████▎                   | 1060/1288 [09:35<01:56,  1.96it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████▍                   | 1061/1288 [09:36<01:58,  1.92it/s] 82%|███████████████████████████████████████████████████████████████████████████████████████████▌                   | 1062/1288 [09:36<01:52,  2.01it/s] 83%|███████████████████████████████████████████████████████████████████████████████████████████▌                   | 1063/1288 [09:37<01:56,  1.93it/s] 83%|███████████████████████████████████████████████████████████████████████████████████████████▋                   | 1064/1288 [09:37<01:53,  1.98it/s] 83%|███████████████████████████████████████████████████████████████████████████████████████████▊                   | 1065/1288 [09:38<01:56,  1.91it/s] 83%|███████████████████████████████████████████████████████████████████████████████████████████▊                   | 1066/1288 [09:38<01:55,  1.93it/s] 83%|███████████████████████████████████████████████████████████████████████████████████████████▉                   | 1067/1288 [09:39<01:56,  1.89it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████                   | 1068/1288 [09:39<01:57,  1.87it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▏                  | 1069/1288 [09:40<01:54,  1.90it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▏                  | 1070/1288 [09:40<01:56,  1.87it/s]                                                                                                                                                        {'loss': 6.0413, 'grad_norm': 3.078125, 'learning_rate': 0.007693077341751137, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2977.69, 'epoch': 3.31}
 83%|████████████████████████████████████████████████████████████████████████████████████████████▏                  | 1070/1288 [09:40<01:56,  1.87it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▎                  | 1071/1288 [09:41<01:51,  1.95it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▍                  | 1072/1288 [09:41<01:53,  1.91it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▍                  | 1073/1288 [09:42<01:52,  1.92it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▌                  | 1074/1288 [09:42<01:50,  1.94it/s] 83%|████████████████████████████████████████████████████████████████████████████████████████████▋                  | 1075/1288 [09:43<01:49,  1.95it/s] 84%|████████████████████████████████████████████████████████████████████████████████████████████▋                  | 1076/1288 [09:44<01:50,  1.91it/s] 84%|████████████████████████████████████████████████████████████████████████████████████████████▊                  | 1077/1288 [09:44<01:45,  2.01it/s] 84%|████████████████████████████████████████████████████████████████████████████████████████████▉                  | 1078/1288 [09:45<01:47,  1.95it/s] 84%|████████████████████████████████████████████████████████████████████████████████████████████▉                  | 1079/1288 [09:45<01:44,  1.99it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████                  | 1080/1288 [09:46<01:48,  1.92it/s]                                                                                                                                                        {'loss': 6.3148, 'grad_norm': 2.890625, 'learning_rate': 0.00702311926112657, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2583.76, 'epoch': 3.34}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████                  | 1080/1288 [09:46<01:48,  1.92it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▏                 | 1081/1288 [09:46<01:45,  1.96it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▏                 | 1082/1288 [09:47<01:45,  1.95it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▎                 | 1083/1288 [09:47<01:42,  2.01it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▍                 | 1084/1288 [09:48<01:41,  2.00it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▌                 | 1085/1288 [09:48<01:39,  2.04it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▌                 | 1086/1288 [09:48<01:38,  2.05it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▋                 | 1087/1288 [09:49<01:38,  2.04it/s] 84%|█████████████████████████████████████████████████████████████████████████████████████████████▊                 | 1088/1288 [09:49<01:37,  2.06it/s] 85%|█████████████████████████████████████████████████████████████████████████████████████████████▊                 | 1089/1288 [09:50<01:37,  2.04it/s] 85%|█████████████████████████████████████████████████████████████████████████████████████████████▉                 | 1090/1288 [09:50<01:37,  2.03it/s]                                                                                                                                                        {'loss': 6.4992, 'grad_norm': 3.625, 'learning_rate': 0.006381471707839448, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2566.24, 'epoch': 3.37}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████▉                 | 1090/1288 [09:50<01:37,  2.03it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████                 | 1091/1288 [09:51<01:40,  1.97it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████                 | 1092/1288 [09:52<01:43,  1.90it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▏                | 1093/1288 [09:52<01:39,  1.96it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▎                | 1094/1288 [09:53<01:41,  1.91it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▎                | 1095/1288 [09:53<01:43,  1.87it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▍                | 1096/1288 [09:54<01:43,  1.85it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▌                | 1097/1288 [09:54<01:43,  1.84it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▋                | 1098/1288 [09:55<01:41,  1.87it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▋                | 1099/1288 [09:55<01:39,  1.91it/s] 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                | 1100/1288 [09:56<01:35,  1.96it/s]                                                                                                                                                        {'loss': 6.2456, 'grad_norm': 3.1875, 'learning_rate': 0.005768557359857241, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3296.91, 'epoch': 3.41}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████▊                | 1100/1288 [09:56<01:35,  1.96it/s][2025-12-13 20:49:09,967] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-1100
 85%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 1101/1288 [09:59<03:57,  1.27s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 1102/1288 [09:59<03:13,  1.04s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████                | 1103/1288 [10:00<02:43,  1.13it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▏               | 1104/1288 [10:00<02:23,  1.28it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▏               | 1105/1288 [10:01<02:07,  1.43it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▎               | 1106/1288 [10:01<01:54,  1.59it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▍               | 1107/1288 [10:02<01:47,  1.69it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▍               | 1108/1288 [10:02<01:42,  1.76it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▌               | 1109/1288 [10:03<01:36,  1.85it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▋               | 1110/1288 [10:03<01:36,  1.85it/s]                                                                                                                                                        {'loss': 6.3436, 'grad_norm': 2.9375, 'learning_rate': 0.005184779967477893, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2581.78, 'epoch': 3.44}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████▋               | 1110/1288 [10:03<01:36,  1.85it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▋               | 1111/1288 [10:04<01:36,  1.84it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▊               | 1112/1288 [10:04<01:32,  1.90it/s] 86%|███████████████████████████████████████████████████████████████████████████████████████████████▉               | 1113/1288 [10:05<01:32,  1.89it/s] 86%|████████████████████████████████████████████████████████████████████████████████████████████████               | 1114/1288 [10:05<01:27,  2.00it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████               | 1115/1288 [10:06<01:28,  1.95it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▏              | 1116/1288 [10:06<01:26,  1.99it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▎              | 1117/1288 [10:07<01:25,  1.99it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▎              | 1118/1288 [10:07<01:27,  1.95it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▍              | 1119/1288 [10:08<01:24,  2.01it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1120/1288 [10:08<01:25,  1.96it/s]                                                                                                                                                        {'loss': 6.113, 'grad_norm': 3.9375, 'learning_rate': 0.004630524087364018, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2644.42, 'epoch': 3.47}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1120/1288 [10:08<01:25,  1.96it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1121/1288 [10:09<01:24,  1.97it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▋              | 1122/1288 [10:09<01:23,  2.00it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▊              | 1123/1288 [10:10<01:22,  1.99it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▊              | 1124/1288 [10:10<01:23,  1.95it/s] 87%|████████████████████████████████████████████████████████████████████████████████████████████████▉              | 1125/1288 [10:11<01:24,  1.93it/s] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████              | 1126/1288 [10:11<01:21,  1.98it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▏             | 1127/1288 [10:12<01:22,  1.95it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▏             | 1128/1288 [10:13<01:22,  1.94it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▎             | 1129/1288 [10:13<01:19,  2.01it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1130/1288 [10:13<01:20,  1.97it/s]                                                                                                                                                        {'loss': 6.1765, 'grad_norm': 3.921875, 'learning_rate': 0.00410615482922056, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2616.84, 'epoch': 3.5}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1130/1288 [10:13<01:20,  1.97it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1131/1288 [10:14<01:16,  2.05it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▌             | 1132/1288 [10:14<01:14,  2.08it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▋             | 1133/1288 [10:15<01:17,  2.01it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▋             | 1134/1288 [10:15<01:16,  2.01it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▊             | 1135/1288 [10:16<01:17,  1.97it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▉             | 1136/1288 [10:16<01:15,  2.01it/s] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▉             | 1137/1288 [10:17<01:13,  2.05it/s] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████             | 1138/1288 [10:17<01:15,  1.99it/s] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1139/1288 [10:18<01:14,  2.00it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1140/1288 [10:18<01:12,  2.04it/s]                                                                                                                                                        {'loss': 6.2806, 'grad_norm': 3.453125, 'learning_rate': 0.0036120176152829643, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2764.65, 'epoch': 3.53}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1140/1288 [10:18<01:12,  2.04it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▎            | 1141/1288 [10:19<01:15,  1.94it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▍            | 1142/1288 [10:19<01:14,  1.95it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▌            | 1143/1288 [10:20<01:15,  1.92it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▌            | 1144/1288 [10:21<01:15,  1.92it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▋            | 1145/1288 [10:21<01:13,  1.95it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▊            | 1146/1288 [10:22<01:14,  1.92it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▊            | 1147/1288 [10:22<01:14,  1.89it/s] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████▉            | 1148/1288 [10:23<01:13,  1.92it/s] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████            | 1149/1288 [10:23<01:12,  1.91it/s] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████            | 1150/1288 [10:24<01:12,  1.91it/s]                                                                                                                                                        {'loss': 6.207, 'grad_norm': 3.671875, 'learning_rate': 0.003148437952774275, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2995.33, 'epoch': 3.56}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████            | 1150/1288 [10:24<01:12,  1.91it/s] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████▏           | 1151/1288 [10:24<01:10,  1.96it/s] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████▎           | 1152/1288 [10:25<01:09,  1.96it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▎           | 1153/1288 [10:25<01:09,  1.95it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▍           | 1154/1288 [10:26<01:08,  1.95it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 1155/1288 [10:26<01:09,  1.91it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 1156/1288 [10:27<01:08,  1.92it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▋           | 1157/1288 [10:27<01:07,  1.93it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▊           | 1158/1288 [10:28<01:05,  1.99it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1159/1288 [10:28<01:02,  2.06it/s] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1160/1288 [10:29<01:02,  2.03it/s]                                                                                                                                                        {'loss': 6.371, 'grad_norm': 3.578125, 'learning_rate': 0.00271572121948091, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2730.62, 'epoch': 3.59}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1160/1288 [10:29<01:02,  2.03it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████           | 1161/1288 [10:29<01:03,  2.00it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 1162/1288 [10:30<01:05,  1.92it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 1163/1288 [10:30<01:03,  1.97it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 1164/1288 [10:31<01:03,  1.97it/s] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 1165/1288 [10:31<01:01,  2.00it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 1166/1288 [10:32<00:59,  2.04it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 1167/1288 [10:32<00:59,  2.02it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 1168/1288 [10:33<01:01,  1.96it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 1169/1288 [10:33<01:01,  1.93it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 1170/1288 [10:34<01:00,  1.94it/s]                                                                                                                                                        {'loss': 6.3744, 'grad_norm': 2.984375, 'learning_rate': 0.002314152462588659, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2838.63, 'epoch': 3.62}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 1170/1288 [10:34<01:00,  1.94it/s] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 1171/1288 [10:34<01:00,  1.95it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████          | 1172/1288 [10:35<00:59,  1.96it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████          | 1173/1288 [10:35<01:03,  1.81it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 1174/1288 [10:36<00:57,  1.98it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 1175/1288 [10:36<00:58,  1.93it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 1176/1288 [10:37<00:58,  1.93it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 1177/1288 [10:37<00:56,  1.97it/s] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1178/1288 [10:38<00:56,  1.94it/s] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1179/1288 [10:38<00:55,  1.98it/s] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1180/1288 [10:39<00:53,  2.01it/s]                                                                                                                                                        {'loss': 6.1967, 'grad_norm': 4.25, 'learning_rate': 0.0019439962109110032, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2915.45, 'epoch': 3.65}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1180/1288 [10:39<00:53,  2.01it/s] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 1181/1288 [10:39<00:53,  2.01it/s] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 1182/1288 [10:40<00:53,  1.99it/s] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 1183/1288 [10:40<00:52,  2.00it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████         | 1184/1288 [10:41<00:52,  1.98it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████         | 1185/1288 [10:41<00:52,  1.97it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 1186/1288 [10:42<00:52,  1.96it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 1187/1288 [10:42<00:51,  1.96it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 1188/1288 [10:43<00:50,  1.99it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 1189/1288 [10:43<00:48,  2.02it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 1190/1288 [10:44<00:49,  1.97it/s]                                                                                                                                                        {'loss': 6.2772, 'grad_norm': 3.8125, 'learning_rate': 0.0016054963006338742, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2576.52, 'epoch': 3.68}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 1190/1288 [10:44<00:49,  1.97it/s] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1191/1288 [10:44<00:48,  2.02it/s] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1192/1288 [10:45<00:46,  2.04it/s] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 1193/1288 [10:45<00:47,  2.01it/s] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 1194/1288 [10:46<00:46,  2.03it/s] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 1195/1288 [10:46<00:45,  2.06it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████        | 1196/1288 [10:47<00:45,  2.03it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 1197/1288 [10:47<00:46,  1.97it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 1198/1288 [10:48<00:45,  1.97it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 1199/1288 [10:48<00:45,  1.97it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 1200/1288 [10:49<00:44,  1.97it/s]                                                                                                                                                        {'loss': 6.2284, 'grad_norm': 3.875, 'learning_rate': 0.0012988757146913223, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2688.72, 'epoch': 3.72}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 1200/1288 [10:49<00:44,  1.97it/s][2025-12-13 20:50:03,207] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-1200
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 1201/1288 [10:52<01:48,  1.25s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 1202/1288 [10:52<01:27,  1.02s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 1203/1288 [10:53<01:14,  1.14it/s] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 1204/1288 [10:53<01:04,  1.31it/s] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 1205/1288 [10:54<00:57,  1.44it/s] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 1206/1288 [10:55<00:52,  1.57it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████       | 1207/1288 [10:55<00:47,  1.70it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████       | 1208/1288 [10:56<00:45,  1.77it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 1209/1288 [10:56<00:42,  1.88it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1210/1288 [10:56<00:40,  1.93it/s]                                                                                                                                                        {'loss': 6.3279, 'grad_norm': 3.28125, 'learning_rate': 0.0010243364358780815, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2789.43, 'epoch': 3.75}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1210/1288 [10:56<00:40,  1.93it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1211/1288 [10:57<00:38,  1.99it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 1212/1288 [10:57<00:38,  1.98it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 1213/1288 [10:58<00:37,  1.98it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 1214/1288 [10:58<00:37,  1.98it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 1215/1288 [10:59<00:37,  1.95it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 1216/1288 [10:59<00:37,  1.94it/s] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 1217/1288 [11:00<00:37,  1.91it/s] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 1218/1288 [11:01<00:36,  1.94it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████      | 1219/1288 [11:01<00:36,  1.90it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 1220/1288 [11:02<00:36,  1.88it/s]                                                                                                                                                        {'loss': 6.0937, 'grad_norm': 3.140625, 'learning_rate': 0.0007820593137957244, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2695.28, 'epoch': 3.78}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 1220/1288 [11:02<00:36,  1.88it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 1221/1288 [11:02<00:34,  1.94it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 1222/1288 [11:03<00:33,  1.96it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 1223/1288 [11:03<00:33,  1.93it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 1224/1288 [11:04<00:32,  1.94it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 1225/1288 [11:04<00:32,  1.96it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 1226/1288 [11:05<00:31,  1.95it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 1227/1288 [11:05<00:31,  1.92it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 1228/1288 [11:06<00:31,  1.93it/s] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 1229/1288 [11:06<00:29,  1.97it/s] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1230/1288 [11:07<00:28,  2.02it/s]                                                                                                                                                        {'loss': 6.5538, 'grad_norm': 3.453125, 'learning_rate': 0.0005722039457200234, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2891.73, 'epoch': 3.81}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1230/1288 [11:07<00:28,  2.02it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1231/1288 [11:07<00:28,  1.99it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 1232/1288 [11:08<00:27,  2.02it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 1233/1288 [11:08<00:26,  2.04it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 1234/1288 [11:09<00:26,  2.02it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 1235/1288 [11:09<00:25,  2.05it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 1236/1288 [11:09<00:22,  2.28it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 1237/1288 [11:10<00:22,  2.29it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 1238/1288 [11:11<00:24,  2.00it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 1239/1288 [11:11<00:24,  2.01it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 1240/1288 [11:12<00:24,  1.98it/s]                                                                                                                                                        {'loss': 6.3883, 'grad_norm': 2.578125, 'learning_rate': 0.0003949085714681389, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2283.22, 'epoch': 3.84}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 1240/1288 [11:12<00:24,  1.98it/s] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 1241/1288 [11:12<00:23,  1.97it/s] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████    | 1242/1288 [11:13<00:22,  2.01it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████    | 1243/1288 [11:13<00:22,  2.04it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 1244/1288 [11:14<00:21,  2.01it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 1245/1288 [11:14<00:20,  2.08it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1246/1288 [11:14<00:20,  2.09it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1247/1288 [11:15<00:19,  2.06it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 1248/1288 [11:16<00:20,  1.93it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 1249/1288 [11:16<00:20,  1.93it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 1250/1288 [11:17<00:19,  1.96it/s]                                                                                                                                                        {'loss': 6.1749, 'grad_norm': 3.484375, 'learning_rate': 0.0002502899823346727, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2713.55, 'epoch': 3.87}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 1250/1288 [11:17<00:19,  1.96it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 1251/1288 [11:17<00:19,  1.93it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 1252/1288 [11:18<00:18,  1.90it/s] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 1253/1288 [11:18<00:17,  1.99it/s] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 1254/1288 [11:19<00:17,  1.99it/s] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 1255/1288 [11:19<00:16,  1.95it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 1256/1288 [11:20<00:16,  1.92it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 1257/1288 [11:20<00:16,  1.90it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 1258/1288 [11:21<00:15,  1.88it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1259/1288 [11:21<00:15,  1.86it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1260/1288 [11:22<00:15,  1.86it/s]                                                                                                                                                        {'loss': 5.8971, 'grad_norm': 2.59375, 'learning_rate': 0.00013844344415676058, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2919.3, 'epoch': 3.9}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1260/1288 [11:22<00:15,  1.86it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 1261/1288 [11:22<00:14,  1.90it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 1262/1288 [11:23<00:13,  1.92it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 1263/1288 [11:23<00:12,  1.97it/s] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 1264/1288 [11:24<00:11,  2.01it/s] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 1265/1288 [11:24<00:11,  1.96it/s] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 1266/1288 [11:25<00:11,  1.96it/s] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 1267/1288 [11:25<00:11,  1.81it/s] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 1268/1288 [11:26<00:10,  1.89it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 1269/1288 [11:26<00:09,  1.90it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 1270/1288 [11:27<00:09,  1.96it/s]                                                                                                                                                        {'loss': 6.2062, 'grad_norm': 4.1875, 'learning_rate': 5.9442634558792845e-05, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 2872.91, 'epoch': 3.93}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 1270/1288 [11:27<00:09,  1.96it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 1271/1288 [11:27<00:08,  1.95it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 1272/1288 [11:28<00:08,  1.99it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 1273/1288 [11:28<00:07,  2.03it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 1274/1288 [11:29<00:06,  2.00it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 1275/1288 [11:29<00:06,  1.96it/s] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 1276/1288 [11:30<00:05,  2.01it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 1277/1288 [11:30<00:05,  2.15it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 1278/1288 [11:31<00:04,  2.01it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 1279/1288 [11:31<00:04,  2.03it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 1280/1288 [11:32<00:03,  2.24it/s]                                                                                                                                                        {'loss': 6.2402, 'grad_norm': 6.0625, 'learning_rate': 1.3339594418138035e-05, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'tokens_per_second_per_gpu': 3539.23, 'epoch': 3.96}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 1280/1288 [11:32<00:03,  2.24it/s] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1281/1288 [11:32<00:03,  2.10it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1282/1288 [11:33<00:02,  2.07it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 1283/1288 [11:33<00:02,  2.08it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1284/1288 [11:34<00:01,  2.05it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1285/1288 [11:34<00:01,  2.06it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 1286/1288 [11:35<00:00,  2.07it/s]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 1287/1288 [11:35<00:00,  1.99it/s]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1288/1288 [11:36<00:00,  2.08it/s][2025-12-13 20:50:49,898] [INFO] [axolotl.core.trainers.base._save:671] [PID:1121389] Saving model checkpoint to ./out_6_lima/checkpoint-1288
                                                                                                                                                        {'train_runtime': 701.5896, 'train_samples_per_second': 1.836, 'train_steps_per_second': 1.836, 'train_loss': 6.3926010724180236, 'memory/max_active (GiB)': 6.75, 'memory/max_allocated (GiB)': 6.75, 'memory/device_reserved (GiB)': 7.17, 'epoch': 3.99}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1288/1288 [11:38<00:00,  2.08it/s]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1288/1288 [11:38<00:00,  1.84it/s]
[2025-12-13 20:50:52,468] [INFO] [axolotl.train.save_trained_model:225] [PID:1121389] Training completed! Saving trained model to ./out_6_lima.
[2025-12-13 20:50:54,912] [INFO] [axolotl.train.save_trained_model:346] [PID:1121389] Model successfully saved to ./out_6_lima