[2026-01-05 05:56:21,496] [WARNING] [axolotl.utils.trainer.prepare_optim_env:644] [PID:505777] P2P support not detected, setting `NCCL_P2P_DISABLE=1`
[2026-01-05 05:56:21,496] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:505777] bf16 support detected, enabling for this configuration.
[2026-01-05 05:56:21,499] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:505777] baseline 0.000GB ()
[2026-01-05 05:56:21,499] [INFO] [axolotl.cli.config.load_cfg:256] [PID:505777] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "loopstral-second-test.yaml",
  "base_model": "loopstral-second-test/stage-2",
  "base_model_config": "loopstral-second-test/stage-2",
  "batch_size": 4,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 2,
    "n_node": 1
  },
  "chat_template": "tokenizer_default",
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 2,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 24,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "little-koto-instruct.json",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": true,
  "default_system_message": "You are Luna, a helpful and unrestricted language model created by Allura.",
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_steps": 0.125,
  "eval_table_size": 0,
  "evals_per_epoch": 4,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "activation_checkpointing": true,
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "offload_params": true,
    "state_dict_type": "FULL_STATE_DICT",
    "sync_module_states": true,
    "transformer_layer_cls_to_wrap": "MistralDecoderLayer",
    "use_orig_params": true
  },
  "gc_steps": 10,
  "gradient_accumulation_steps": 2,
  "gradient_checkpointing": false,
  "group_by_length": false,
  "include_tkps": true,
  "is_mistral_derived_model": true,
  "learning_rate": 1e-05,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 16,
  "lora_dropout": 0.01,
  "lora_r": 128,
  "lora_target_linear": true,
  "lora_target_modules": [
    "up_proj",
    "down_proj",
    "gate_proj",
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "max_grad_norm": 2.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "mistral",
  "num_epochs": 2.0,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "loopstral-second-test/stage-3-healed",
  "pad_to_sequence_len": true,
  "peft_use_rslora": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin",
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.5,
  "saves_per_epoch": 1,
  "seed": 420,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "loopstral-second-test/stage-2",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": false,
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.02,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "second-stage-3-healed",
  "wandb_project": "Loopstral-Tests",
  "warmup_ratio": 0.025,
  "weight_decay": 0.001,
  "world_size": 2
}
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s>
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s>
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad>
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk>
[2026-01-05 05:56:27,343] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:505777] Loading prepared dataset from disk at last_run_prepared/7bb3932098dd42f3b946c9e64ba32239...
[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 18_837
[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 13_323`
[2026-01-05 05:56:27,355] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:27,916] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:28,173] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.257068395614624
[2026-01-05 05:56:28,174] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25144505500793457
[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25107741355895996
[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:28,927] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25074076652526855
[2026-01-05 05:56:29,428] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]
[2026-01-05 05:56:29,485] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 1
[2026-01-05 05:56:29,499] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.7664794921875, 0.9197753667831421]
[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: None
[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 2
[2026-01-05 05:56:29,505] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 922_178
[2026-01-05 05:56:29,514] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 746_491`
[2026-01-05 05:56:29,525] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:29,778] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252016544342041
[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:30,282] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252286434173584
[2026-01-05 05:56:30,283] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:30,534] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25146055221557617
[2026-01-05 05:56:30,535] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
[2026-01-05 05:56:30,786] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25194621086120605
[2026-01-05 05:56:30,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228]
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 57
[2026-01-05 05:56:30,803] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.9874610304832458, 0.9874610304832458]
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: 0.99
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 114
[2026-01-05 05:56:30,804] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:505777] Maximum number of steps set at 114
[2026-01-05 05:56:30,828] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:505777] loading tokenizer... loopstral-second-test/stage-2
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s>
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s>
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad>
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk>
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:505777] Loading model
[2026-01-05 05:56:30,994] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:505777] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-05 05:56:30,995] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:505777] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-05 05:56:30,995] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:505777] Applying multipack dataloader patch for sample packing...
[2026-01-05 05:56:31,073] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:505777] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
[2026-01-05 05:56:31,145] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:505777] Applying Cut Cross Entropy to model type: mistral
Loading checkpoint shards:   0%|                                                                | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|██████████████████▋                                     | 1/3 [00:01<00:03,  1.52s/it]Loading checkpoint shards:  67%|█████████████████████████████████████▎                  | 2/3 [00:03<00:01,  1.78s/it]Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.58s/it]Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.60s/it]
[2026-01-05 05:56:36,035] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:505777] Converting modules to torch.bfloat16
[2026-01-05 05:56:36,037] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:505777] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
[2026-01-05 05:56:39,795] [INFO] [axolotl.train.save_initial_configs:417] [PID:505777] Pre-saving tokenizer to loopstral-second-test/stage-3-healed...
[2026-01-05 05:56:39,829] [INFO] [axolotl.train.save_initial_configs:422] [PID:505777] Pre-saving model config to loopstral-second-test/stage-3-healed...
[2026-01-05 05:56:39,831] [INFO] [axolotl.train.execute_training:212] [PID:505777] Starting trainer...
[2026-01-05 05:56:42,363] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0816106796264648
[2026-01-05 05:56:43,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0880126953125
[2026-01-05 05:56:44,524] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0731561183929443
[2026-01-05 05:56:45,606] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0811669826507568
[2026-01-05 05:56:45,607] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228]
[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralForCausalLM because mixed precision turned on in FSDP. Affects: model.embed_tokens.weight, model.norm.weight, lm_head.weight.
  warnings.warn(

[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralDecoderLayer because mixed precision turned on in FSDP. Affects: self_attn.q_proj.weight, self_attn.k_proj.weight, self_attn.v_proj.weight, self_attn.o_proj.weight, mlp.gate_proj.weight, mlp.up_proj.weight, mlp.down_proj.weight, input_layernorm.weight, post_attention_layernorm.weight.
  warnings.warn(

[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1974: UserWarning: FSDP upcast of low precision parameters may affect the precision of model checkpoints.
  warnings.warn(

[34m[1mwandb[0m: Currently logged in as: [33mcooawoo[0m ([33mcooawoo-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run 90pp12rs (0.2s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run 90pp12rs (0.2s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.23.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/aibox/training/wandb/run-20260105_055655-90pp12rs[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msecond-stage-3-healed[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests/runs/90pp12rs[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-01-05 05:56:58,375] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:505777] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                         | 0/114 [00:00<?, ?it/s][2026-01-05 05:56:58,382] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 05:57:01,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.6251494884490967
[2026-01-05 05:57:03,393] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.645111322402954
[2026-01-05 05:57:04,955] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5617477893829346
[2026-01-05 05:57:06,522] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5672385692596436
[2026-01-05 05:57:06,523] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.66s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 1.1707839965820312, 'eval_runtime': 35.4697, 'eval_samples_per_second': 0.536, 'eval_steps_per_second': 0.282, 'eval_ppl': 3.2245, 'memory/max_active (GiB)': 3.76, 'memory/max_allocated (GiB)': 3.76, 'memory/device_reserved (GiB)': 9.22, 'epoch': 0}
  0%|                                                                                         | 0/114 [00:43<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.66s/it][A
                                                                                                                      [A  1%|▋                                                                              | 1/114 [01:12<2:16:26, 72.45s/it]                                                                                                                      {'loss': 1.2216, 'grad_norm': 19.611356735229492, 'learning_rate': 0.0, 'ppl': 3.3926, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 420.22, 'total_tokens': 23701, 'epoch': 0.02}
  1%|▋                                                                              | 1/114 [01:12<2:16:26, 72.45s/it]  2%|█▍                                                                             | 2/114 [01:29<1:14:56, 40.15s/it]                                                                                                                      {'loss': 1.3721, 'grad_norm': 22.5228214263916, 'learning_rate': 5e-06, 'ppl': 3.9436, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 438.27, 'total_tokens': 36593, 'epoch': 0.04}
  2%|█▍                                                                             | 2/114 [01:29<1:14:56, 40.15s/it]  3%|██▏                                                                              | 3/114 [01:47<55:10, 29.82s/it]                                                                                                                      {'loss': 1.2242, 'grad_norm': 23.2207088470459, 'learning_rate': 1e-05, 'ppl': 3.4014, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 235.72, 'total_tokens': 47910, 'epoch': 0.05}
  3%|██▏                                                                              | 3/114 [01:47<55:10, 29.82s/it]  4%|██▊                                                                              | 4/114 [02:04<45:44, 24.95s/it]                                                                                                                      {'loss': 1.2708, 'grad_norm': 9.459047317504883, 'learning_rate': 9.998033131915266e-06, 'ppl': 3.5637, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.71, 'total_tokens': 62377, 'epoch': 0.07}
  4%|██▊                                                                              | 4/114 [02:04<45:44, 24.95s/it]  4%|███▌                                                                             | 5/114 [02:22<40:39, 22.38s/it]                                                                                                                      {'loss': 1.0452, 'grad_norm': 8.951719284057617, 'learning_rate': 9.992134075089085e-06, 'ppl': 2.844, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 371.1, 'total_tokens': 74989, 'epoch': 0.09}
  4%|███▌                                                                             | 5/114 [02:22<40:39, 22.38s/it]  5%|████▎                                                                            | 6/114 [02:40<37:09, 20.64s/it]                                                                                                                      {'loss': 1.037, 'grad_norm': 5.003825664520264, 'learning_rate': 9.982307470588097e-06, 'ppl': 2.8207, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.42, 'total_tokens': 88184, 'epoch': 0.11}
  5%|████▎                                                                            | 6/114 [02:40<37:09, 20.64s/it]  6%|████▉                                                                            | 7/114 [02:57<35:04, 19.67s/it]                                                                                                                      {'loss': 1.1807, 'grad_norm': 6.691038131713867, 'learning_rate': 9.968561049466214e-06, 'ppl': 3.2567, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 291.57, 'total_tokens': 100065, 'epoch': 0.12}
  6%|████▉                                                                            | 7/114 [02:57<35:04, 19.67s/it]  7%|█████▋                                                                           | 8/114 [03:15<33:30, 18.96s/it]                                                                                                                      {'loss': 1.1553, 'grad_norm': 4.891448974609375, 'learning_rate': 9.950905626682229e-06, 'ppl': 3.175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 305.04, 'total_tokens': 111981, 'epoch': 0.14}
  7%|█████▋                                                                           | 8/114 [03:15<33:30, 18.96s/it]  8%|██████▍                                                                          | 9/114 [03:32<32:31, 18.59s/it]                                                                                                                      {'loss': 0.9665, 'grad_norm': 4.162895202636719, 'learning_rate': 9.92935509259118e-06, 'ppl': 2.6287, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.6, 'total_tokens': 126309, 'epoch': 0.16}
  8%|██████▍                                                                          | 9/114 [03:32<32:31, 18.59s/it]  9%|███████                                                                         | 10/114 [03:50<31:48, 18.35s/it]                                                                                                                      {'loss': 1.1024, 'grad_norm': 4.0764946937561035, 'learning_rate': 9.903926402016153e-06, 'ppl': 3.0114, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 315.69, 'total_tokens': 138777, 'epoch': 0.18}
  9%|███████                                                                         | 10/114 [03:50<31:48, 18.35s/it] 10%|███████▋                                                                        | 11/114 [04:08<31:01, 18.08s/it]                                                                                                                      {'loss': 0.9937, 'grad_norm': 4.487460613250732, 'learning_rate': 9.874639560909118e-06, 'ppl': 2.7012, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 381.39, 'total_tokens': 152449, 'epoch': 0.19}
 10%|███████▋                                                                        | 11/114 [04:08<31:01, 18.08s/it] 11%|████████▍                                                                       | 12/114 [04:25<30:28, 17.93s/it]                                                                                                                      {'loss': 1.0321, 'grad_norm': 4.153564453125, 'learning_rate': 9.841517610611309e-06, 'ppl': 2.807, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.38, 'total_tokens': 165644, 'epoch': 0.21}
 11%|████████▍                                                                       | 12/114 [04:25<30:28, 17.93s/it] 11%|█████████                                                                       | 13/114 [04:43<30:05, 17.88s/it]                                                                                                                      {'loss': 0.8913, 'grad_norm': 5.131930828094482, 'learning_rate': 9.804586609725499e-06, 'ppl': 2.4383, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.4, 'total_tokens': 177744, 'epoch': 0.23}
 11%|█████████                                                                       | 13/114 [04:43<30:05, 17.88s/it] 12%|█████████▊                                                                      | 14/114 [05:01<29:39, 17.80s/it]                                                                                                                      {'loss': 0.862, 'grad_norm': 4.371148109436035, 'learning_rate': 9.763875613614482e-06, 'ppl': 2.3679, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 316.1, 'total_tokens': 190738, 'epoch': 0.25}
 12%|█████████▊                                                                      | 14/114 [05:01<29:39, 17.80s/it] 13%|██████████▌                                                                     | 15/114 [05:19<29:34, 17.92s/it]                                                                                                                      {'loss': 1.1077, 'grad_norm': 4.898997783660889, 'learning_rate': 9.719416651541839e-06, 'ppl': 3.0274, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.3, 'total_tokens': 203458, 'epoch': 0.26}
 13%|██████████▌                                                                     | 15/114 [05:19<29:34, 17.92s/it][2026-01-05 06:02:17,805] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:02:21,514] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.7565350532531738
[2026-01-05 06:02:23,355] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8411920070648193
[2026-01-05 06:02:25,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8490314483642578
[2026-01-05 06:02:27,076] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8708629608154297
[2026-01-05 06:02:27,078] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.77s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.8464773297309875, 'eval_runtime': 10.0269, 'eval_samples_per_second': 1.895, 'eval_steps_per_second': 0.997, 'eval_ppl': 2.3314, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.26}
 13%|██████████▌                                                                     | 15/114 [05:38<29:34, 17.92s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.77s/it][A
                                                                                                                      [A 14%|███████████▏                                                                    | 16/114 [05:57<39:07, 23.96s/it]                                                                                                                      {'loss': 0.9622, 'grad_norm': 3.9995832443237305, 'learning_rate': 9.671244701472999e-06, 'ppl': 2.6174, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 315.28, 'total_tokens': 227749, 'epoch': 0.28}
 14%|███████████▏                                                                    | 16/114 [05:57<39:07, 23.96s/it] 15%|███████████▉                                                                    | 17/114 [06:16<36:20, 22.47s/it]                                                                                                                      {'loss': 1.1259, 'grad_norm': 4.939190864562988, 'learning_rate': 9.619397662556434e-06, 'ppl': 3.083, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 279.61, 'total_tokens': 238481, 'epoch': 0.3}
 15%|███████████▉                                                                    | 17/114 [06:16<36:20, 22.47s/it] 16%|████████████▋                                                                   | 18/114 [06:34<33:40, 21.05s/it]                                                                                                                      {'loss': 1.038, 'grad_norm': 4.293745040893555, 'learning_rate': 9.563916325306595e-06, 'ppl': 2.8236, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.17, 'total_tokens': 251682, 'epoch': 0.32}
 16%|████████████▋                                                                   | 18/114 [06:34<33:40, 21.05s/it] 17%|█████████████▎                                                                  | 19/114 [06:51<31:46, 20.07s/it]                                                                                                                      {'loss': 0.9465, 'grad_norm': 4.432325839996338, 'learning_rate': 9.504844339512096e-06, 'ppl': 2.5767, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 338.71, 'total_tokens': 263764, 'epoch': 0.33}
 17%|█████████████▎                                                                  | 19/114 [06:51<31:46, 20.07s/it] 18%|██████████████                                                                  | 20/114 [07:09<30:29, 19.46s/it]                                                                                                                      {'loss': 1.0822, 'grad_norm': 4.204977512359619, 'learning_rate': 9.442228179894362e-06, 'ppl': 2.9512, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.66, 'total_tokens': 276958, 'epoch': 0.35}
 18%|██████████████                                                                  | 20/114 [07:09<30:29, 19.46s/it] 18%|██████████████▋                                                                 | 21/114 [07:27<29:28, 19.01s/it]                                                                                                                      {'loss': 0.9275, 'grad_norm': 4.079134464263916, 'learning_rate': 9.376117109543769e-06, 'ppl': 2.5282, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 361.9, 'total_tokens': 289687, 'epoch': 0.37}
 18%|██████████████▋                                                                 | 21/114 [07:27<29:28, 19.01s/it] 19%|███████████████▍                                                                | 22/114 [07:45<28:29, 18.58s/it]                                                                                                                      {'loss': 1.0468, 'grad_norm': 19.045801162719727, 'learning_rate': 9.306563141162046e-06, 'ppl': 2.8485, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 404.94, 'total_tokens': 303849, 'epoch': 0.39}
 19%|███████████████▍                                                                | 22/114 [07:45<28:29, 18.58s/it] 20%|████████████████▏                                                               | 23/114 [08:03<27:58, 18.44s/it]                                                                                                                      {'loss': 0.9479, 'grad_norm': 4.227816581726074, 'learning_rate': 9.233620996141421e-06, 'ppl': 2.5803, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 399.93, 'total_tokens': 318222, 'epoch': 0.4}
 20%|████████████████▏                                                               | 23/114 [08:03<27:58, 18.44s/it] 21%|████████████████▊                                                               | 24/114 [08:21<27:18, 18.21s/it]                                                                                                                      {'loss': 0.8656, 'grad_norm': 4.401808261871338, 'learning_rate': 9.157348061512728e-06, 'ppl': 2.3764, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 424.57, 'total_tokens': 331525, 'epoch': 0.42}
 21%|████████████████▊                                                               | 24/114 [08:21<27:18, 18.21s/it] 22%|█████████████████▌                                                              | 25/114 [08:39<26:47, 18.06s/it]                                                                                                                      {'loss': 0.9866, 'grad_norm': 4.104758262634277, 'learning_rate': 9.077804344796302e-06, 'ppl': 2.6821, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 342.8, 'total_tokens': 344242, 'epoch': 0.44}
 22%|█████████████████▌                                                              | 25/114 [08:39<26:47, 18.06s/it] 23%|██████████████████▏                                                             | 26/114 [08:56<26:17, 17.92s/it]                                                                                                                      {'loss': 0.8213, 'grad_norm': 3.642549514770508, 'learning_rate': 8.995052426791247e-06, 'ppl': 2.2735, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 425.35, 'total_tokens': 358434, 'epoch': 0.46}
 23%|██████████████████▏                                                             | 26/114 [08:56<26:17, 17.92s/it] 24%|██████████████████▉                                                             | 27/114 [09:14<25:57, 17.91s/it]                                                                                                                      {'loss': 0.7607, 'grad_norm': 3.46269154548645, 'learning_rate': 8.90915741234015e-06, 'ppl': 2.1398, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.27, 'total_tokens': 373583, 'epoch': 0.47}
 24%|██████████████████▉                                                             | 27/114 [09:14<25:57, 17.91s/it] 25%|███████████████████▋                                                            | 28/114 [09:32<25:31, 17.80s/it]                                                                                                                      {'loss': 0.8874, 'grad_norm': 4.47441291809082, 'learning_rate': 8.820186879108038e-06, 'ppl': 2.4288, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.56, 'total_tokens': 388098, 'epoch': 0.49}
 25%|███████████████████▋                                                            | 28/114 [09:32<25:31, 17.80s/it] 25%|████████████████████▎                                                           | 29/114 [09:49<25:11, 17.79s/it]                                                                                                                      {'loss': 0.9208, 'grad_norm': 7.110525131225586, 'learning_rate': 8.728210824415829e-06, 'ppl': 2.5113, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 378.74, 'total_tokens': 401056, 'epoch': 0.51}
 25%|████████████████████▎                                                           | 29/114 [09:49<25:11, 17.79s/it] 26%|█████████████████████                                                           | 30/114 [10:07<24:54, 17.80s/it]                                                                                                                      {'loss': 0.9311, 'grad_norm': 4.498164176940918, 'learning_rate': 8.633301610170136e-06, 'ppl': 2.5373, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 382.0, 'total_tokens': 414830, 'epoch': 0.53}
 26%|█████████████████████                                                           | 30/114 [10:07<24:54, 17.80s/it][2026-01-05 06:07:06,008] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:07:09,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8823564052581787
[2026-01-05 06:07:11,720] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.9062902927398682
[2026-01-05 06:07:13,619] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8988819122314453
[2026-01-05 06:07:15,483] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.863325834274292
[2026-01-05 06:07:15,484] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.28s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.8130025267601013, 'eval_runtime': 9.0489, 'eval_samples_per_second': 2.1, 'eval_steps_per_second': 1.105, 'eval_ppl': 2.2547, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.53}
 26%|█████████████████████                                                           | 30/114 [10:26<24:54, 17.80s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.28s/it][A
                                                                                                                      [A 27%|█████████████████████▊                                                          | 31/114 [10:43<32:09, 23.25s/it]                                                                                                                      {'loss': 0.9568, 'grad_norm': 4.3019700050354, 'learning_rate': 8.535533905932739e-06, 'ppl': 2.6034, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.97, 'total_tokens': 438063, 'epoch': 0.54}
 27%|█████████████████████▊                                                          | 31/114 [10:43<32:09, 23.25s/it] 28%|██████████████████████▍                                                         | 32/114 [11:01<29:36, 21.66s/it]                                                                                                                      {'loss': 1.4274, 'grad_norm': 13.445784568786621, 'learning_rate': 8.43498463017451e-06, 'ppl': 4.1678, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 411.95, 'total_tokens': 449755, 'epoch': 0.56}
 28%|██████████████████████▍                                                         | 32/114 [11:01<29:36, 21.66s/it] 29%|███████████████████████▏                                                        | 33/114 [11:19<27:33, 20.41s/it]                                                                                                                      {'loss': 0.9124, 'grad_norm': 4.726632595062256, 'learning_rate': 8.331732889760021e-06, 'ppl': 2.4903, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 430.35, 'total_tokens': 464296, 'epoch': 0.58}
 29%|███████████████████████▏                                                        | 33/114 [11:19<27:33, 20.41s/it] 30%|███████████████████████▊                                                        | 34/114 [11:36<26:05, 19.57s/it]                                                                                                                      {'loss': 0.9753, 'grad_norm': 4.2374067306518555, 'learning_rate': 8.22585991771044e-06, 'ppl': 2.652, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 386.8, 'total_tokens': 476505, 'epoch': 0.6}
 30%|███████████████████████▊                                                        | 34/114 [11:36<26:05, 19.57s/it] 31%|████████████████████████▌                                                       | 35/114 [11:54<25:01, 19.00s/it]                                                                                                                      {'loss': 0.9245, 'grad_norm': 3.9087975025177, 'learning_rate': 8.117449009293668e-06, 'ppl': 2.5206, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 364.2, 'total_tokens': 490425, 'epoch': 0.61}
 31%|████████████████████████▌                                                       | 35/114 [11:54<25:01, 19.00s/it] 32%|█████████████████████████▎                                                      | 36/114 [12:12<24:21, 18.73s/it]                                                                                                                      {'loss': 0.889, 'grad_norm': 4.065995216369629, 'learning_rate': 8.00658545649203e-06, 'ppl': 2.4327, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.93, 'total_tokens': 504789, 'epoch': 0.63}
 32%|█████████████████████████▎                                                      | 36/114 [12:12<24:21, 18.73s/it] 32%|█████████████████████████▉                                                      | 37/114 [12:30<23:38, 18.42s/it]                                                                                                                      {'loss': 0.8679, 'grad_norm': 4.762394428253174, 'learning_rate': 7.89335648089903e-06, 'ppl': 2.3819, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 403.62, 'total_tokens': 518553, 'epoch': 0.65}
 32%|█████████████████████████▉                                                      | 37/114 [12:30<23:38, 18.42s/it] 33%|██████████████████████████▋                                                     | 38/114 [12:48<23:08, 18.27s/it]                                                                                                                      {'loss': 1.0176, 'grad_norm': 15.393413543701172, 'learning_rate': 7.777851165098012e-06, 'ppl': 2.7665, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 294.78, 'total_tokens': 531189, 'epoch': 0.67}
 33%|██████████████████████████▋                                                     | 38/114 [12:48<23:08, 18.27s/it] 34%|███████████████████████████▎                                                    | 39/114 [13:05<22:33, 18.04s/it]                                                                                                                      {'loss': 0.9678, 'grad_norm': 3.8640310764312744, 'learning_rate': 7.660160382576683e-06, 'ppl': 2.6321, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.2, 'total_tokens': 544812, 'epoch': 0.68}
 34%|███████████████████████████▎                                                    | 39/114 [13:05<22:33, 18.04s/it] 35%|████████████████████████████                                                    | 40/114 [13:23<22:18, 18.09s/it]                                                                                                                      {'loss': 1.0927, 'grad_norm': 4.1391167640686035, 'learning_rate': 7.540376726232648e-06, 'ppl': 2.9823, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 353.51, 'total_tokens': 558027, 'epoch': 0.7}
 35%|████████████████████████████                                                    | 40/114 [13:23<22:18, 18.09s/it] 36%|████████████████████████████▊                                                   | 41/114 [13:41<21:45, 17.88s/it]                                                                                                                      {'loss': 0.8597, 'grad_norm': 3.4433345794677734, 'learning_rate': 7.4185944355261996e-06, 'ppl': 2.3625, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 410.17, 'total_tokens': 572138, 'epoch': 0.72}
 36%|████████████████████████████▊                                                   | 41/114 [13:41<21:45, 17.88s/it] 37%|█████████████████████████████▍                                                  | 42/114 [13:58<21:19, 17.77s/it]                                                                                                                      {'loss': 0.8931, 'grad_norm': 3.9547741413116455, 'learning_rate': 7.294909322337689e-06, 'ppl': 2.4427, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 374.06, 'total_tokens': 585392, 'epoch': 0.74}
 37%|█████████████████████████████▍                                                  | 42/114 [13:58<21:19, 17.77s/it] 38%|██████████████████████████████▏                                                 | 43/114 [14:16<21:04, 17.81s/it]                                                                                                                      {'loss': 0.9103, 'grad_norm': 7.8218255043029785, 'learning_rate': 7.169418695587791e-06, 'ppl': 2.4851, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.66, 'total_tokens': 599321, 'epoch': 0.75}
 38%|██████████████████████████████▏                                                 | 43/114 [14:16<21:04, 17.81s/it] 39%|██████████████████████████████▉                                                 | 44/114 [14:34<20:41, 17.73s/it]                                                                                                                      {'loss': 0.8742, 'grad_norm': 4.100659370422363, 'learning_rate': 7.042221284679982e-06, 'ppl': 2.397, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 277.15, 'total_tokens': 611687, 'epoch': 0.77}
 39%|██████████████████████████████▉                                                 | 44/114 [14:34<20:41, 17.73s/it] 39%|███████████████████████████████▌                                                | 45/114 [14:52<20:32, 17.87s/it]                                                                                                                      {'loss': 1.0304, 'grad_norm': 4.246405124664307, 'learning_rate': 6.913417161825449e-06, 'ppl': 2.8022, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.16, 'total_tokens': 624725, 'epoch': 0.79}
 39%|███████████████████████████████▌                                                | 45/114 [14:52<20:32, 17.87s/it][2026-01-05 06:11:50,684] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:11:54,496] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8704819679260254
[2026-01-05 06:11:56,390] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.893763780593872
[2026-01-05 06:11:58,227] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.836214542388916
[2026-01-05 06:12:00,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8623669147491455
[2026-01-05 06:12:00,091] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.30s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.8040304780006409, 'eval_runtime': 9.0899, 'eval_samples_per_second': 2.09, 'eval_steps_per_second': 1.1, 'eval_ppl': 2.2345, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.79}
 39%|███████████████████████████████▌                                                | 45/114 [15:10<20:32, 17.87s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.30s/it][A
                                                                                                                      [A 40%|████████████████████████████████▎                                               | 46/114 [15:28<26:29, 23.37s/it]                                                                                                                      {'loss': 0.936, 'grad_norm': 4.026333332061768, 'learning_rate': 6.783107663311566e-06, 'ppl': 2.5498, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 425.2, 'total_tokens': 648077, 'epoch': 0.81}
 40%|████████████████████████████████▎                                               | 46/114 [15:28<26:29, 23.37s/it] 41%|████████████████████████████████▉                                               | 47/114 [15:46<24:11, 21.67s/it]                                                                                                                      {'loss': 0.893, 'grad_norm': 3.673527717590332, 'learning_rate': 6.651395309775837e-06, 'ppl': 2.4424, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 394.49, 'total_tokens': 662323, 'epoch': 0.82}
 41%|████████████████████████████████▉                                               | 47/114 [15:46<24:11, 21.67s/it] 42%|█████████████████████████████████▋                                              | 48/114 [16:03<22:26, 20.40s/it]                                                                                                                      {'loss': 1.0356, 'grad_norm': 6.087688446044922, 'learning_rate': 6.518383725548074e-06, 'ppl': 2.8168, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.91, 'total_tokens': 674394, 'epoch': 0.84}
 42%|█████████████████████████████████▋                                              | 48/114 [16:03<22:26, 20.40s/it] 43%|██████████████████████████████████▍                                             | 49/114 [16:21<21:16, 19.64s/it]                                                                                                                      {'loss': 0.8771, 'grad_norm': 3.8041579723358154, 'learning_rate': 6.384177557124247e-06, 'ppl': 2.4039, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 366.56, 'total_tokens': 687865, 'epoch': 0.86}
 43%|██████████████████████████████████▍                                             | 49/114 [16:21<21:16, 19.64s/it] 44%|███████████████████████████████████                                             | 50/114 [16:39<20:26, 19.16s/it]                                                                                                                      {'loss': 0.991, 'grad_norm': 3.9628713130950928, 'learning_rate': 6.248882390836135e-06, 'ppl': 2.6939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 383.67, 'total_tokens': 700828, 'epoch': 0.88}
 44%|███████████████████████████████████                                             | 50/114 [16:39<20:26, 19.16s/it] 45%|███████████████████████████████████▊                                            | 51/114 [16:57<19:36, 18.67s/it]                                                                                                                      {'loss': 0.8278, 'grad_norm': 3.998246669769287, 'learning_rate': 6.112604669781572e-06, 'ppl': 2.2883, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 396.65, 'total_tokens': 713732, 'epoch': 0.89}
 45%|███████████████████████████████████▊                                            | 51/114 [16:57<19:36, 18.67s/it] 46%|████████████████████████████████████▍                                           | 52/114 [17:14<18:54, 18.30s/it]                                                                                                                      {'loss': 0.8638, 'grad_norm': 4.033308506011963, 'learning_rate': 5.975451610080643e-06, 'ppl': 2.3722, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 336.52, 'total_tokens': 726692, 'epoch': 0.91}
 46%|████████████████████████████████████▍                                           | 52/114 [17:14<18:54, 18.30s/it] 46%|█████████████████████████████████████▏                                          | 53/114 [17:32<18:25, 18.13s/it]                                                                                                                      {'loss': 1.3544, 'grad_norm': 17.30064582824707, 'learning_rate': 5.837531116523683e-06, 'ppl': 3.8744, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 403.9, 'total_tokens': 739602, 'epoch': 0.93}
 46%|█████████████████████████████████████▏                                          | 53/114 [17:32<18:25, 18.13s/it] 47%|█████████████████████████████████████▉                                          | 54/114 [17:49<17:54, 17.92s/it]                                                                                                                      {'loss': 0.9146, 'grad_norm': 3.651843309402466, 'learning_rate': 5.698951697677498e-06, 'ppl': 2.4958, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 410.43, 'total_tokens': 753378, 'epoch': 0.95}
 47%|█████████████████████████████████████▉                                          | 54/114 [17:49<17:54, 17.92s/it] 48%|██████████████████████████████████████▌                                         | 55/114 [18:07<17:40, 17.97s/it]                                                                                                                      {'loss': 1.0104, 'grad_norm': 4.368696212768555, 'learning_rate': 5.559822380516539e-06, 'ppl': 2.7467, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 382.87, 'total_tokens': 767289, 'epoch': 0.96}
 48%|██████████████████████████████████████▌                                         | 55/114 [18:07<17:40, 17.97s/it] 49%|███████████████████████████████████████▎                                        | 56/114 [18:25<17:13, 17.81s/it]                                                                                                                      {'loss': 1.0087, 'grad_norm': 5.310736656188965, 'learning_rate': 5.420252624646238e-06, 'ppl': 2.742, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 311.44, 'total_tokens': 777836, 'epoch': 0.98}
 49%|███████████████████████████████████████▎                                        | 56/114 [18:25<17:13, 17.81s/it] 50%|████████████████████████████████████████                                        | 57/114 [18:44<17:18, 18.22s/it]                                                                                                                      {'loss': 0.8086, 'grad_norm': 4.207986831665039, 'learning_rate': 5.2803522361859596e-06, 'ppl': 2.2448, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 289.66, 'total_tokens': 788519, 'epoch': 1.0}
 50%|████████████████████████████████████████                                        | 57/114 [18:44<17:18, 18.22s/it][2026-01-05 06:15:42,787] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-05 06:16:05,005] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-57
 51%|███████████████████████████████████████▋                                      | 58/114 [21:43<1:02:04, 66.51s/it]                                                                                                                      {'loss': 0.6146, 'grad_norm': 3.4813573360443115, 'learning_rate': 5.140231281379345e-06, 'ppl': 1.8489, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 261.61, 'total_tokens': 801358, 'epoch': 1.02}
 51%|███████████████████████████████████████▋                                      | 58/114 [21:44<1:02:04, 66.51s/it] 52%|█████████████████████████████████████████▍                                      | 59/114 [22:03<48:05, 52.46s/it]                                                                                                                      {'loss': 0.6649, 'grad_norm': 3.9285855293273926, 'learning_rate': 5e-06, 'ppl': 1.9443, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 407.5, 'total_tokens': 814250, 'epoch': 1.04}
 52%|█████████████████████████████████████████▍                                      | 59/114 [22:03<48:05, 52.46s/it] 53%|██████████████████████████████████████████                                      | 60/114 [22:21<37:57, 42.17s/it]                                                                                                                      {'loss': 0.6454, 'grad_norm': 3.718013286590576, 'learning_rate': 4.859768718620656e-06, 'ppl': 1.9067, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 227.37, 'total_tokens': 825567, 'epoch': 1.05}
 53%|██████████████████████████████████████████                                      | 60/114 [22:21<37:57, 42.17s/it][2026-01-05 06:19:19,814] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:19:24,401] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.28102445602417
[2026-01-05 06:19:26,656] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2544238567352295
[2026-01-05 06:19:28,921] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2653965950012207
[2026-01-05 06:19:31,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2563204765319824
[2026-01-05 06:19:31,179] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.48s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.7972212433815002, 'eval_runtime': 9.5865, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 1.043, 'eval_ppl': 2.2194, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.05}
 53%|██████████████████████████████████████████                                      | 60/114 [22:42<37:57, 42.17s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.48s/it][A
                                                                                                                      [A 54%|██████████████████████████████████████████▊                                     | 61/114 [22:59<36:17, 41.08s/it]                                                                                                                      {'loss': 0.6167, 'grad_norm': 5.092803001403809, 'learning_rate': 4.719647763814041e-06, 'ppl': 1.8528, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.69, 'total_tokens': 850896, 'epoch': 1.07}
 54%|██████████████████████████████████████████▊                                     | 61/114 [22:59<36:17, 41.08s/it] 54%|███████████████████████████████████████████▌                                    | 62/114 [23:17<29:34, 34.13s/it]                                                                                                                      {'loss': 0.4792, 'grad_norm': 5.132570743560791, 'learning_rate': 4.579747375353763e-06, 'ppl': 1.6148, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 378.34, 'total_tokens': 863508, 'epoch': 1.09}
 54%|███████████████████████████████████████████▌                                    | 62/114 [23:17<29:34, 34.13s/it] 55%|████████████████████████████████████████████▏                                   | 63/114 [23:35<24:42, 29.08s/it]                                                                                                                      {'loss': 0.4102, 'grad_norm': 4.274756908416748, 'learning_rate': 4.4401776194834615e-06, 'ppl': 1.5071, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.24, 'total_tokens': 876703, 'epoch': 1.11}
 55%|████████████████████████████████████████████▏                                   | 63/114 [23:35<24:42, 29.08s/it] 56%|████████████████████████████████████████████▉                                   | 64/114 [23:52<21:22, 25.64s/it]                                                                                                                      {'loss': 0.6118, 'grad_norm': 5.282613754272461, 'learning_rate': 4.3010483023225045e-06, 'ppl': 1.8437, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 293.01, 'total_tokens': 888584, 'epoch': 1.12}
 56%|████████████████████████████████████████████▉                                   | 64/114 [23:52<21:22, 25.64s/it] 57%|█████████████████████████████████████████████▌                                  | 65/114 [24:10<18:58, 23.23s/it]                                                                                                                      {'loss': 0.5366, 'grad_norm': 4.78963565826416, 'learning_rate': 4.162468883476319e-06, 'ppl': 1.7102, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 302.8, 'total_tokens': 900500, 'epoch': 1.14}
 57%|█████████████████████████████████████████████▌                                  | 65/114 [24:10<18:58, 23.23s/it] 58%|██████████████████████████████████████████████▎                                 | 66/114 [24:28<17:16, 21.59s/it]                                                                                                                      {'loss': 0.4277, 'grad_norm': 4.404544353485107, 'learning_rate': 4.02454838991936e-06, 'ppl': 1.5337, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.06, 'total_tokens': 914828, 'epoch': 1.16}
 58%|██████████████████████████████████████████████▎                                 | 66/114 [24:28<17:16, 21.59s/it] 59%|███████████████████████████████████████████████                                 | 67/114 [24:45<15:57, 20.37s/it]                                                                                                                      {'loss': 0.4992, 'grad_norm': 4.581356525421143, 'learning_rate': 3.887395330218429e-06, 'ppl': 1.6474, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 321.0, 'total_tokens': 927296, 'epoch': 1.18}
 59%|███████████████████████████████████████████████                                 | 67/114 [24:45<15:57, 20.37s/it] 60%|███████████████████████████████████████████████▋                                | 68/114 [25:03<14:59, 19.56s/it]                                                                                                                      {'loss': 0.4236, 'grad_norm': 3.748277425765991, 'learning_rate': 3.751117609163865e-06, 'ppl': 1.5275, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.86, 'total_tokens': 940968, 'epoch': 1.19}
 60%|███████████████████████████████████████████████▋                                | 68/114 [25:03<14:59, 19.56s/it] 61%|████████████████████████████████████████████████▍                               | 69/114 [25:20<14:13, 18.98s/it]                                                                                                                      {'loss': 0.4085, 'grad_norm': 3.996558427810669, 'learning_rate': 3.6158224428757538e-06, 'ppl': 1.5046, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.06, 'total_tokens': 954163, 'epoch': 1.21}
 61%|████████████████████████████████████████████████▍                               | 69/114 [25:20<14:13, 18.98s/it] 61%|█████████████████████████████████████████████████                               | 70/114 [25:38<13:42, 18.69s/it]                                                                                                                      {'loss': 0.367, 'grad_norm': 3.7183310985565186, 'learning_rate': 3.4816162744519266e-06, 'ppl': 1.4434, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 392.87, 'total_tokens': 966263, 'epoch': 1.23}
 61%|█████████████████████████████████████████████████                               | 70/114 [25:38<13:42, 18.69s/it] 62%|█████████████████████████████████████████████████▊                              | 71/114 [25:56<13:06, 18.29s/it]                                                                                                                      {'loss': 0.3049, 'grad_norm': 3.2405660152435303, 'learning_rate': 3.3486046902241663e-06, 'ppl': 1.3565, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 320.24, 'total_tokens': 979257, 'epoch': 1.25}
 62%|█████████████████████████████████████████████████▊                              | 71/114 [25:56<13:06, 18.29s/it] 63%|██████████████████████████████████████████████████▌                             | 72/114 [26:13<12:34, 17.97s/it]                                                                                                                      {'loss': 0.4782, 'grad_norm': 3.7915146350860596, 'learning_rate': 3.216892336688435e-06, 'ppl': 1.6132, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 352.7, 'total_tokens': 991977, 'epoch': 1.26}
 63%|██████████████████████████████████████████████████▌                             | 72/114 [26:13<12:34, 17.97s/it] 64%|███████████████████████████████████████████████████▏                            | 73/114 [26:31<12:12, 17.88s/it]                                                                                                                      {'loss': 0.4014, 'grad_norm': 3.5778920650482178, 'learning_rate': 3.0865828381745515e-06, 'ppl': 1.4939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.21, 'total_tokens': 1005406, 'epoch': 1.28}
 64%|███████████████████████████████████████████████████▏                            | 73/114 [26:31<12:12, 17.88s/it] 65%|███████████████████████████████████████████████████▉                            | 74/114 [26:48<11:50, 17.77s/it]                                                                                                                      {'loss': 0.5709, 'grad_norm': 5.925182819366455, 'learning_rate': 2.95777871532002e-06, 'ppl': 1.7699, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 291.94, 'total_tokens': 1016138, 'epoch': 1.3}
 65%|███████████████████████████████████████████████████▉                            | 74/114 [26:48<11:50, 17.77s/it] 66%|████████████████████████████████████████████████████▋                           | 75/114 [27:06<11:34, 17.81s/it]                                                                                                                      {'loss': 0.4398, 'grad_norm': 4.066057205200195, 'learning_rate': 2.83058130441221e-06, 'ppl': 1.5524, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 397.09, 'total_tokens': 1029339, 'epoch': 1.32}
 66%|████████████████████████████████████████████████████▋                           | 75/114 [27:06<11:34, 17.81s/it][2026-01-05 06:24:05,049] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:24:09,576] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2422144412994385
[2026-01-05 06:24:11,841] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2649495601654053
[2026-01-05 06:24:14,135] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2932779788970947
[2026-01-05 06:24:16,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.286076068878174
[2026-01-05 06:24:16,423] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.31s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.8332963585853577, 'eval_runtime': 9.262, 'eval_samples_per_second': 2.051, 'eval_steps_per_second': 1.08, 'eval_ppl': 2.3009, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.32}
 66%|████████████████████████████████████████████████████▋                           | 75/114 [27:27<11:34, 17.81s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.31s/it][A
                                                                                                                      [A 67%|█████████████████████████████████████████████████████▎                          | 76/114 [27:44<15:10, 23.95s/it]                                                                                                                      {'loss': 0.4122, 'grad_norm': 3.938491106033325, 'learning_rate': 2.705090677662311e-06, 'ppl': 1.5101, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 341.7, 'total_tokens': 1052283, 'epoch': 1.33}
 67%|█████████████████████████████████████████████████████▎                          | 76/114 [27:44<15:10, 23.95s/it] 68%|██████████████████████████████████████████████████████                          | 77/114 [28:02<13:38, 22.11s/it]                                                                                                                      {'loss': 0.4916, 'grad_norm': 4.054359436035156, 'learning_rate': 2.5814055644738013e-06, 'ppl': 1.6349, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 417.0, 'total_tokens': 1065477, 'epoch': 1.35}
 68%|██████████████████████████████████████████████████████                          | 77/114 [28:02<13:38, 22.11s/it] 68%|██████████████████████████████████████████████████████▋                         | 78/114 [28:20<12:24, 20.69s/it]                                                                                                                      {'loss': 0.3917, 'grad_norm': 4.133195877075195, 'learning_rate': 2.4596232737673544e-06, 'ppl': 1.4795, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 373.89, 'total_tokens': 1078206, 'epoch': 1.37}
 68%|██████████████████████████████████████████████████████▋                         | 78/114 [28:20<12:24, 20.69s/it] 69%|███████████████████████████████████████████████████████▍                        | 79/114 [28:38<11:36, 19.90s/it]                                                                                                                      {'loss': 0.8396, 'grad_norm': 6.9484052658081055, 'learning_rate': 2.339839617423318e-06, 'ppl': 2.3154, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.68, 'total_tokens': 1092368, 'epoch': 1.39}
 69%|███████████████████████████████████████████████████████▍                        | 79/114 [28:38<11:36, 19.90s/it] 70%|████████████████████████████████████████████████████████▏                       | 80/114 [28:55<10:54, 19.24s/it]                                                                                                                      {'loss': 0.5273, 'grad_norm': 5.065030574798584, 'learning_rate': 2.2221488349019903e-06, 'ppl': 1.6944, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 409.1, 'total_tokens': 1106741, 'epoch': 1.4}
 70%|████████████████████████████████████████████████████████▏                       | 80/114 [28:55<10:54, 19.24s/it] 71%|████████████████████████████████████████████████████████▊                       | 81/114 [29:13<10:16, 18.70s/it]                                                                                                                      {'loss': 0.3963, 'grad_norm': 4.900491714477539, 'learning_rate': 2.1066435191009717e-06, 'ppl': 1.4863, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 430.18, 'total_tokens': 1120044, 'epoch': 1.42}
 71%|████████████████████████████████████████████████████████▊                       | 81/114 [29:13<10:16, 18.70s/it] 72%|█████████████████████████████████████████████████████████▌                      | 82/114 [29:30<09:44, 18.27s/it]                                                                                                                      {'loss': 0.4278, 'grad_norm': 3.917367458343506, 'learning_rate': 1.9934145435079705e-06, 'ppl': 1.5339, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.67, 'total_tokens': 1132761, 'epoch': 1.44}
 72%|█████████████████████████████████████████████████████████▌                      | 82/114 [29:30<09:44, 18.27s/it] 73%|██████████████████████████████████████████████████████████▏                     | 83/114 [29:48<09:20, 18.08s/it]                                                                                                                      {'loss': 0.3575, 'grad_norm': 3.7160727977752686, 'learning_rate': 1.8825509907063328e-06, 'ppl': 1.4298, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.13, 'total_tokens': 1146953, 'epoch': 1.46}
 73%|██████████████████████████████████████████████████████████▏                     | 83/114 [29:48<09:20, 18.08s/it] 74%|██████████████████████████████████████████████████████████▉                     | 84/114 [30:05<08:56, 17.89s/it]                                                                                                                      {'loss': 0.2989, 'grad_norm': 3.235100507736206, 'learning_rate': 1.7741400822895633e-06, 'ppl': 1.3484, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 435.06, 'total_tokens': 1162102, 'epoch': 1.47}
 74%|██████████████████████████████████████████████████████████▉                     | 84/114 [30:05<08:56, 17.89s/it] 75%|███████████████████████████████████████████████████████████▋                    | 85/114 [30:23<08:37, 17.83s/it]                                                                                                                      {'loss': 0.4173, 'grad_norm': 3.7896361351013184, 'learning_rate': 1.6682671102399806e-06, 'ppl': 1.5179, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 414.25, 'total_tokens': 1176617, 'epoch': 1.49}
 75%|███████████████████████████████████████████████████████████▋                    | 85/114 [30:23<08:37, 17.83s/it] 75%|████████████████████████████████████████████████████████████▎                   | 86/114 [30:40<08:15, 17.70s/it]                                                                                                                      {'loss': 0.5225, 'grad_norm': 3.7193076610565186, 'learning_rate': 1.5650153698254916e-06, 'ppl': 1.6862, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 386.25, 'total_tokens': 1189575, 'epoch': 1.51}
 75%|████████████████████████████████████████████████████████████▎                   | 86/114 [30:40<08:15, 17.70s/it] 76%|█████████████████████████████████████████████████████████████                   | 87/114 [30:58<08:01, 17.82s/it]                                                                                                                      {'loss': 0.4523, 'grad_norm': 3.9064459800720215, 'learning_rate': 1.4644660940672628e-06, 'ppl': 1.5719, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.18, 'total_tokens': 1203349, 'epoch': 1.53}
 76%|█████████████████████████████████████████████████████████████                   | 87/114 [30:58<08:01, 17.82s/it] 77%|█████████████████████████████████████████████████████████████▊                  | 88/114 [31:16<07:39, 17.68s/it]                                                                                                                      {'loss': 0.4023, 'grad_norm': 3.633103847503662, 'learning_rate': 1.3666983898298659e-06, 'ppl': 1.4953, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.67, 'total_tokens': 1215720, 'epoch': 1.54}
 77%|█████████████████████████████████████████████████████████████▊                  | 88/114 [31:16<07:39, 17.68s/it] 78%|██████████████████████████████████████████████████████████████▍                 | 89/114 [31:33<07:21, 17.67s/it]                                                                                                                      {'loss': 1.3284, 'grad_norm': 9.749296188354492, 'learning_rate': 1.2717891755841722e-06, 'ppl': 3.775, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 409.51, 'total_tokens': 1227412, 'epoch': 1.56}
 78%|██████████████████████████████████████████████████████████████▍                 | 89/114 [31:33<07:21, 17.67s/it] 79%|███████████████████████████████████████████████████████████████▏                | 90/114 [31:51<07:06, 17.78s/it]                                                                                                                      {'loss': 0.4467, 'grad_norm': 3.298785448074341, 'learning_rate': 1.1798131208919628e-06, 'ppl': 1.5631, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.1, 'total_tokens': 1241953, 'epoch': 1.58}
 79%|███████████████████████████████████████████████████████████████▏                | 90/114 [31:51<07:06, 17.78s/it][2026-01-05 06:28:50,293] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:28:54,407] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0476205348968506
[2026-01-05 06:28:56,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0721523761749268
[2026-01-05 06:28:58,500] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.020061492919922
[2026-01-05 06:29:00,540] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0398201942443848
[2026-01-05 06:29:00,623] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.30s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.813373863697052, 'eval_runtime': 9.1778, 'eval_samples_per_second': 2.07, 'eval_steps_per_second': 1.09, 'eval_ppl': 2.2555, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.58}
 79%|███████████████████████████████████████████████████████████████▏                | 90/114 [32:11<07:06, 17.78s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.30s/it][A
                                                                                                                      [A 80%|███████████████████████████████████████████████████████████████▊                | 91/114 [32:28<09:02, 23.57s/it]                                                                                                                      {'loss': 0.4378, 'grad_norm': 3.9499611854553223, 'learning_rate': 1.0908425876598512e-06, 'ppl': 1.5493, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 388.15, 'total_tokens': 1265024, 'epoch': 1.6}
 80%|███████████████████████████████████████████████████████████████▊                | 91/114 [32:28<09:02, 23.57s/it] 81%|████████████████████████████████████████████████████████████████▌               | 92/114 [32:46<08:01, 21.87s/it]                                                                                                                      {'loss': 0.4448, 'grad_norm': 4.025573253631592, 'learning_rate': 1.004947573208756e-06, 'ppl': 1.5602, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.45, 'total_tokens': 1278944, 'epoch': 1.61}
 81%|████████████████████████████████████████████████████████████████▌               | 92/114 [32:46<08:01, 21.87s/it] 82%|█████████████████████████████████████████████████████████████████▎              | 93/114 [33:04<07:11, 20.52s/it]                                                                                                                      {'loss': 0.4514, 'grad_norm': 4.208862781524658, 'learning_rate': 9.221956552036992e-07, 'ppl': 1.5705, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 415.53, 'total_tokens': 1293308, 'epoch': 1.63}
 82%|█████████████████████████████████████████████████████████████████▎              | 93/114 [33:04<07:11, 20.52s/it] 82%|█████████████████████████████████████████████████████████████████▉              | 94/114 [33:21<06:33, 19.69s/it]                                                                                                                      {'loss': 0.4805, 'grad_norm': 5.097556114196777, 'learning_rate': 8.426519384872733e-07, 'ppl': 1.6169, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 402.99, 'total_tokens': 1307072, 'epoch': 1.65}
 82%|█████████████████████████████████████████████████████████████████▉              | 94/114 [33:22<06:33, 19.69s/it] 83%|██████████████████████████████████████████████████████████████████▋             | 95/114 [33:39<06:01, 19.05s/it]                                                                                                                      {'loss': 0.7964, 'grad_norm': 5.547924995422363, 'learning_rate': 7.663790038585794e-07, 'ppl': 2.2175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 300.56, 'total_tokens': 1319708, 'epoch': 1.67}
 83%|██████████████████████████████████████████████████████████████████▋             | 95/114 [33:39<06:01, 19.05s/it] 84%|███████████████████████████████████████████████████████████████████▎            | 96/114 [33:57<05:36, 18.69s/it]                                                                                                                      {'loss': 0.4699, 'grad_norm': 3.6507017612457275, 'learning_rate': 6.934368588379553e-07, 'ppl': 1.5998, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 388.97, 'total_tokens': 1333331, 'epoch': 1.68}
 84%|███████████████████████████████████████████████████████████████████▎            | 96/114 [33:57<05:36, 18.69s/it] 85%|████████████████████████████████████████████████████████████████████            | 97/114 [34:14<05:10, 18.29s/it]                                                                                                                      {'loss': 0.6074, 'grad_norm': 4.161734104156494, 'learning_rate': 6.238828904562316e-07, 'ppl': 1.8357, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 370.47, 'total_tokens': 1346546, 'epoch': 1.7}
 85%|████████████████████████████████████████████████████████████████████            | 97/114 [34:14<05:10, 18.29s/it] 86%|████████████████████████████████████████████████████████████████████▊           | 98/114 [34:32<04:50, 18.14s/it]                                                                                                                      {'loss': 0.4091, 'grad_norm': 3.1512742042541504, 'learning_rate': 5.577718201056392e-07, 'ppl': 1.5055, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 400.58, 'total_tokens': 1360657, 'epoch': 1.72}
 86%|████████████████████████████████████████████████████████████████████▊           | 98/114 [34:32<04:50, 18.14s/it] 87%|█████████████████████████████████████████████████████████████████████▍          | 99/114 [34:49<04:28, 17.90s/it]                                                                                                                      {'loss': 0.4531, 'grad_norm': 3.652284860610962, 'learning_rate': 4.951556604879049e-07, 'ppl': 1.5732, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 378.33, 'total_tokens': 1373911, 'epoch': 1.74}
 87%|█████████████████████████████████████████████████████████████████████▍          | 99/114 [34:49<04:28, 17.90s/it] 88%|█████████████████████████████████████████████████████████████████████▎         | 100/114 [35:08<04:13, 18.13s/it]                                                                                                                      {'loss': 0.6774, 'grad_norm': 13.29174518585205, 'learning_rate': 4.3608367469340553e-07, 'ppl': 1.9688, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 368.21, 'total_tokens': 1387840, 'epoch': 1.75}
 88%|█████████████████████████████████████████████████████████████████████▎         | 100/114 [35:08<04:13, 18.13s/it] 89%|█████████████████████████████████████████████████████████████████████▉         | 101/114 [35:26<03:55, 18.12s/it]                                                                                                                      {'loss': 0.4309, 'grad_norm': 3.5832619667053223, 'learning_rate': 3.8060233744356634e-07, 'ppl': 1.5386, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 268.73, 'total_tokens': 1400206, 'epoch': 1.77}
 89%|█████████████████████████████████████████████████████████████████████▉         | 101/114 [35:26<03:55, 18.12s/it] 89%|██████████████████████████████████████████████████████████████████████▋        | 102/114 [35:44<03:36, 18.05s/it]                                                                                                                      {'loss': 0.5536, 'grad_norm': 3.793043375015259, 'learning_rate': 3.287552985270015e-07, 'ppl': 1.7395, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.95, 'total_tokens': 1413244, 'epoch': 1.79}
 89%|██████████████████████████████████████████████████████████████████████▋        | 102/114 [35:44<03:36, 18.05s/it] 90%|███████████████████████████████████████████████████████████████████████▍       | 103/114 [36:02<03:17, 17.95s/it]                                                                                                                      {'loss': 0.5032, 'grad_norm': 3.7566542625427246, 'learning_rate': 2.8058334845816214e-07, 'ppl': 1.654, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 425.22, 'total_tokens': 1425734, 'epoch': 1.81}
 90%|███████████████████████████████████████████████████████████████████████▍       | 103/114 [36:02<03:17, 17.95s/it] 91%|████████████████████████████████████████████████████████████████████████       | 104/114 [36:19<02:58, 17.83s/it]                                                                                                                      {'loss': 0.4549, 'grad_norm': 3.206256866455078, 'learning_rate': 2.3612438638551837e-07, 'ppl': 1.576, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.45, 'total_tokens': 1439980, 'epoch': 1.82}
 91%|████████████████████████████████████████████████████████████████████████       | 104/114 [36:19<02:58, 17.83s/it] 92%|████████████████████████████████████████████████████████████████████████▊      | 105/114 [36:37<02:41, 17.89s/it]                                                                                                                      {'loss': 0.6494, 'grad_norm': 4.32829475402832, 'learning_rate': 1.9541339027450256e-07, 'ppl': 1.9144, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 340.64, 'total_tokens': 1452051, 'epoch': 1.84}
 92%|████████████████████████████████████████████████████████████████████████▊      | 105/114 [36:37<02:41, 17.89s/it][2026-01-05 06:33:36,260] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
[2026-01-05 06:33:40,708] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1876299381256104
[2026-01-05 06:33:42,931] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2227494716644287
[2026-01-05 06:33:45,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2258124351501465
[2026-01-05 06:33:47,352] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1935596466064453
[2026-01-05 06:33:47,353] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

  0%|                                                                                           | 0/2 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.28s/it][A                                                                                                                      
                                                                                                                      [A{'eval_loss': 0.8144508004188538, 'eval_runtime': 9.188, 'eval_samples_per_second': 2.068, 'eval_steps_per_second': 1.088, 'eval_ppl': 2.2579, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.84}
 92%|████████████████████████████████████████████████████████████████████████▊      | 105/114 [36:58<02:41, 17.89s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.28s/it][A
                                                                                                                      [A 93%|█████████████████████████████████████████████████████████████████████████▍     | 106/114 [37:15<03:11, 23.93s/it]                                                                                                                      {'loss': 0.4638, 'grad_norm': 3.6060004234313965, 'learning_rate': 1.5848238938869332e-07, 'ppl': 1.5901, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.31, 'total_tokens': 1476384, 'epoch': 1.86}
 93%|█████████████████████████████████████████████████████████████████████████▍     | 106/114 [37:15<03:11, 23.93s/it] 94%|██████████████████████████████████████████████████████████████████████████▏    | 107/114 [37:33<02:35, 22.17s/it]                                                                                                                      {'loss': 0.5655, 'grad_norm': 3.643385648727417, 'learning_rate': 1.253604390908819e-07, 'ppl': 1.7603, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 392.59, 'total_tokens': 1489347, 'epoch': 1.88}
 94%|██████████████████████████████████████████████████████████████████████████▏    | 107/114 [37:33<02:35, 22.17s/it] 95%|██████████████████████████████████████████████████████████████████████████▊    | 108/114 [37:51<02:04, 20.80s/it]                                                                                                                      {'loss': 0.4432, 'grad_norm': 3.812006711959839, 'learning_rate': 9.607359798384785e-08, 'ppl': 1.5577, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 395.61, 'total_tokens': 1502251, 'epoch': 1.89}
 95%|██████████████████████████████████████████████████████████████████████████▊    | 108/114 [37:51<02:04, 20.80s/it] 96%|███████████████████████████████████████████████████████████████████████████▌   | 109/114 [38:09<01:39, 19.86s/it]                                                                                                                      {'loss': 0.4601, 'grad_norm': 3.268007516860962, 'learning_rate': 7.064490740882057e-08, 'ppl': 1.5842, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 331.97, 'total_tokens': 1515211, 'epoch': 1.91}
 96%|███████████████████████████████████████████████████████████████████████████▌   | 109/114 [38:09<01:39, 19.86s/it] 96%|████████████████████████████████████████████████████████████████████████████▏  | 110/114 [38:27<01:17, 19.27s/it]                                                                                                                      {'loss': 1.1937, 'grad_norm': 17.401044845581055, 'learning_rate': 4.909437331777178e-08, 'ppl': 3.2993, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.17, 'total_tokens': 1528121, 'epoch': 1.93}
 96%|████████████████████████████████████████████████████████████████████████████▏  | 110/114 [38:27<01:17, 19.27s/it] 97%|████████████████████████████████████████████████████████████████████████████▉  | 111/114 [38:44<00:56, 18.68s/it]                                                                                                                      {'loss': 0.5327, 'grad_norm': 3.476621627807617, 'learning_rate': 3.143895053378698e-08, 'ppl': 1.7035, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 412.86, 'total_tokens': 1541897, 'epoch': 1.95}
 97%|████████████████████████████████████████████████████████████████████████████▉  | 111/114 [38:44<00:56, 18.68s/it] 98%|█████████████████████████████████████████████████████████████████████████████▌ | 112/114 [39:01<00:36, 18.32s/it]                                                                                                                      {'loss': 0.6603, 'grad_norm': 5.652373313903809, 'learning_rate': 1.769252941190458e-08, 'ppl': 1.9354, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.43, 'total_tokens': 1555808, 'epoch': 1.96}
 98%|█████████████████████████████████████████████████████████████████████████████▌ | 112/114 [39:01<00:36, 18.32s/it] 99%|██████████████████████████████████████████████████████████████████████████████▎| 113/114 [39:19<00:18, 18.10s/it]                                                                                                                      {'loss': 0.6319, 'grad_norm': 4.798173427581787, 'learning_rate': 7.865924910916977e-09, 'ppl': 1.8812, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 309.08, 'total_tokens': 1566355, 'epoch': 1.98}
 99%|██████████████████████████████████████████████████████████████████████████████▎| 113/114 [39:19<00:18, 18.10s/it]100%|███████████████████████████████████████████████████████████████████████████████| 114/114 [39:39<00:00, 18.62s/it]                                                                                                                      {'loss': 0.4319, 'grad_norm': 3.836909055709839, 'learning_rate': 1.9668680847356735e-09, 'ppl': 1.5402, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 292.8, 'total_tokens': 1577038, 'epoch': 2.0}
100%|███████████████████████████████████████████████████████████████████████████████| 114/114 [39:39<00:00, 18.62s/it][2026-01-05 06:36:37,660] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-05 06:36:57,988] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-114
                                                                                                                      {'train_runtime': 2526.3299, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.045, 'train_loss': 0.76008216357022, 'memory/max_active (GiB)': 9.02, 'memory/max_allocated (GiB)': 9.02, 'memory/device_reserved (GiB)': 9.82, 'epoch': 2.0}
100%|███████████████████████████████████████████████████████████████████████████████| 114/114 [42:03<00:00, 18.62s/it]100%|███████████████████████████████████████████████████████████████████████████████| 114/114 [42:03<00:00, 22.13s/it]
[2026-01-05 06:39:01,711] [INFO] [axolotl.train.save_trained_model:233] [PID:505777] Training completed! Saving trained model to loopstral-second-test/stage-3-healed.
[2026-01-05 06:39:01,713] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-05 06:39:23,082] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed
[2026-01-05 06:39:47,279] [INFO] [axolotl.train.save_trained_model:351] [PID:505777] Model successfully saved to loopstral-second-test/stage-3-healed
[0m