[2025-10-07 11:50:13,057] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:8314] bf16 support detected, enabling for this configuration.
[2025-10-07 11:50:13,281] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:8314] baseline 0.000GB ()
[2025-10-07 11:50:13,282] [INFO] [axolotl.cli.config.load_cfg:248] [PID:8314] config:
{
  "activation_offloading": false,
  "adapter": "qlora",
  "axolotl_config_path": "muse-marvin-attn.yaml",
  "base_model": "LatitudeGames/Muse-12B",
  "base_model_config": "LatitudeGames/Muse-12B",
  "batch_size": 8,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 2,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 2,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_prepared_path": "last_run_prepared",
  "dataset_processes": 24,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "grimulkan/LimaRP-augmented",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "data_files": "marvin.json",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "ToastyPigeon/steve-and-marvin",
      "trust_remote_code": false,
      "type": "completion"
    },
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "ToastyPigeon/kimi-stories-completion",
      "trust_remote_code": false,
      "type": "completion"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_steps": 0.1,
  "eval_table_size": 0,
  "evals_per_epoch": 10,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "activation_checkpointing": true,
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "limit_all_gathers": true,
    "offload_params": true,
    "sharding_strategy": "FULL_SHARD",
    "state_dict_type": "FULL_STATE_DICT",
    "sync_module_states": true,
    "transformer_layer_cls_to_wrap": "MistralDecoderLayer",
    "use_orig_params": false
  },
  "gc_steps": 10,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": false,
  "group_by_length": false,
  "hub_model_id": "ToastyPigeon/muse-marvin-lora-2",
  "hub_strategy": "every_save",
  "include_tkps": true,
  "is_mistral_derived_model": true,
  "learning_rate": 1e-05,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "lora_r": 32,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "mistral",
  "num_epochs": 1.0,
  "optimizer": "adamw_torch_fused",
  "output_dir": "ckpts-mmarv",
  "pad_to_sequence_len": true,
  "peft_use_rslora": false,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin",
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.1,
  "save_total_limit": 1,
  "saves_per_epoch": 10,
  "seed": 69,
  "sequence_len": 16384,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "LatitudeGames/Muse-12B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.025,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "r32-qlora-all-linear",
  "wandb_project": "MuseMarvin",
  "warmup_ratio": 0.025,
  "weight_decay": 0.01,
  "world_size": 2
}
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8314] EOS: 131072 / <|im_end|>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8314] BOS: 1 / <s>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8314] PAD: 10 / <pad>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8314] UNK: 0 / <unk>
[2025-10-07 11:50:14,275] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:8314] Loading prepared dataset from disk at last_run_prepared/31b44b9f810943b30f3af91fc7580ba1...
[2025-10-07 11:50:14,288] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:8314] total_num_tokens: 758_181
[2025-10-07 11:50:14,290] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:8314] `total_supervised_tokens: 716_387`
[2025-10-07 11:50:15,905] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6418807506561279
[2025-10-07 11:50:16,550] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6455550193786621
[2025-10-07 11:50:17,196] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6447782516479492
[2025-10-07 11:50:17,842] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6462986469268799
[2025-10-07 11:50:18,408] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
[2025-10-07 11:50:18,460] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:8314] data_loader_len: 5
[2025-10-07 11:50:18,469] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:8314] sample_packing_eff_est across ranks: [0.9845892786979675, 0.9845892786979675]
[2025-10-07 11:50:18,469] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:8314] sample_packing_eff_est: None
[2025-10-07 11:50:18,469] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:8314] total_num_steps: 5
[2025-10-07 11:50:18,562] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:8314] total_num_tokens: 30_240_821
[2025-10-07 11:50:18,664] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:8314] `total_supervised_tokens: 28_482_459`
[2025-10-07 11:50:20,128] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7220749855041504
[2025-10-07 11:50:20,839] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7103662490844727
[2025-10-07 11:50:21,559] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.719578742980957
[2025-10-07 11:50:22,294] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7134625911712646
[2025-10-07 11:50:22,295] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [1861, 1860]
[2025-10-07 11:50:22,296] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:8314] data_loader_len: 232
[2025-10-07 11:50:22,296] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:8314] sample_packing_eff_est across ranks: [0.991807222366333, 0.991807222366333]
[2025-10-07 11:50:22,297] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:8314] sample_packing_eff_est: 1.0
[2025-10-07 11:50:22,297] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:8314] total_num_steps: 232
[2025-10-07 11:50:22,297] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:8314] Maximum number of steps set at 232
[2025-10-07 11:50:22,323] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:8314] Loading tokenizer... LatitudeGames/Muse-12B
[2025-10-07 11:50:23,211] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8314] EOS: 131072 / <|im_end|>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8314] BOS: 1 / <s>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8314] PAD: 10 / <pad>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8314] UNK: 0 / <unk>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:8314] Loading model
[2025-10-07 11:50:23,364] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:8314] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-07 11:50:23,365] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:8314] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-10-07 11:50:23,365] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:8314] Applying multipack dataloader patch for sample packing...
[2025-10-07 11:50:23,385] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:8314] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
[2025-10-07 11:50:23,540] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:8314] Applying Cut Cross Entropy to model type: mistral
Loading checkpoint shards:   0%|                                | 0/5 [00:00<?, ?it/s]Loading checkpoint shards:  20%|████▊                   | 1/5 [00:04<00:18,  4.63s/it]Loading checkpoint shards:  40%|█████████▌              | 2/5 [00:09<00:14,  4.80s/it]Loading checkpoint shards:  60%|██████████████▍         | 3/5 [00:14<00:09,  4.87s/it]Loading checkpoint shards:  80%|███████████████████▏    | 4/5 [00:19<00:04,  4.89s/it]Loading checkpoint shards: 100%|████████████████████████| 5/5 [00:23<00:00,  4.73s/it]Loading checkpoint shards: 100%|████████████████████████| 5/5 [00:23<00:00,  4.77s/it]
[2025-10-07 11:50:47,921] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:8314] Converting modules to torch.bfloat16
[2025-10-07 11:50:47,923] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8314] Memory usage after model load 5.750GB (+5.750GB allocated, +5.797GB reserved)
[2025-10-07 11:50:47,924] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:8314] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 114,032,640 || all params: 12,361,835,520 || trainable%: 0.9225
[2025-10-07 11:50:48,830] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8314] after adapters 0.000GB ()
[2025-10-07 11:50:52,955] [INFO] [axolotl.train.save_initial_configs:408] [PID:8314] Pre-saving adapter config to ckpts-mmarv...
[2025-10-07 11:50:52,977] [INFO] [axolotl.train.save_initial_configs:412] [PID:8314] Pre-saving tokenizer to ckpts-mmarv...
[2025-10-07 11:50:53,168] [INFO] [axolotl.train.save_initial_configs:417] [PID:8314] Pre-saving model config to ckpts-mmarv...
[2025-10-07 11:50:53,171] [INFO] [axolotl.train.execute_training:203] [PID:8314] Starting trainer...
[2025-10-07 11:51:05,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.541344404220581
[2025-10-07 11:51:06,738] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.559652328491211
[2025-10-07 11:51:08,262] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.5244674682617188
[2025-10-07 11:51:09,808] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.54478120803833
[2025-10-07 11:51:09,828] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [1860, 1860]
[34m[1mwandb[0m: Currently logged in as: [33mcooawoo[0m ([33mcooawoo-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run 0vwfkgsm (0.2s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.22.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/training/wandb/run-20251007_115115-0vwfkgsm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mr32-qlora-all-linear[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cooawoo-personal/MuseMarvin[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cooawoo-personal/MuseMarvin/runs/0vwfkgsm[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-10-07 11:51:18,257] [INFO] [axolotl.utils.callbacks.on_train_begin:793] [PID:8314] The Axolotl config has been saved to the WandB run under files.
  0%|                                                         | 0/232 [00:00<?, ?it/s][2025-10-07 11:51:18,258] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 11:51:21,179] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.3499112129211426
[2025-10-07 11:51:22,541] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.361877202987671
[2025-10-07 11:51:23,871] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.3298544883728027
[2025-10-07 11:51:25,162] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.290006399154663
[2025-10-07 11:51:25,163] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:07<01:21,  3.88s/it][A
 13%|██████▌                                           | 3/23 [00:15<01:52,  5.65s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:05,  6.60s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:09,  7.19s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:07,  7.53s/it][A
 30%|███████████████▏                                  | 7/23 [00:48<02:03,  7.72s/it][A
 35%|█████████████████▍                                | 8/23 [00:56<01:57,  7.85s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.01s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.04s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.07s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:29<01:28,  8.08s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:37<01:21,  8.16s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.15s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.14s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.88s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:09<00:47,  7.98s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:17<00:40,  8.04s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:25<00:32,  8.06s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.08s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:42<00:16,  8.14s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:50<00:08,  8.15s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.28s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.5322844982147217, 'eval_runtime': 203.0647, 'eval_samples_per_second': 0.359, 'eval_steps_per_second': 0.182, 'memory/max_active (GiB)': 8.04, 'memory/max_allocated (GiB)': 6.73, 'memory/device_reserved (GiB)': 8.36, 'epoch': 0}
  0%|                                                         | 0/232 [03:29<?, ?it/s]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.28s/it][A
                                                                                      [A  0%|▏                                            | 1/232 [05:12<20:03:02, 312.48s/it]                                                                                      {'loss': 2.5109, 'grad_norm': 0.33189067244529724, 'learning_rate': 0.0, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 4214.45, 'epoch': 0.0}
  0%|▏                                            | 1/232 [05:12<20:03:02, 312.48s/it]  1%|▍                                            | 2/232 [06:56<12:08:54, 190.15s/it]                                                                                      {'loss': 2.6357, 'grad_norm': 0.35216283798217773, 'learning_rate': 2.0000000000000003e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.16, 'epoch': 0.01}
  1%|▍                                            | 2/232 [06:57<12:08:54, 190.15s/it]  1%|▌                                             | 3/232 [08:41<9:36:52, 151.15s/it]                                                                                      {'loss': 2.5537, 'grad_norm': 0.35557371377944946, 'learning_rate': 4.000000000000001e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.76, 'epoch': 0.01}
  1%|▌                                             | 3/232 [08:41<9:36:52, 151.15s/it]  2%|▊                                             | 4/232 [10:25<8:23:35, 132.52s/it]                                                                                      {'loss': 2.5444, 'grad_norm': 0.3299483060836792, 'learning_rate': 6e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 508.06, 'epoch': 0.02}
  2%|▊                                             | 4/232 [10:25<8:23:35, 132.52s/it]  2%|▉                                             | 5/232 [12:06<7:37:27, 120.91s/it]                                                                                      {'loss': 2.5188, 'grad_norm': 0.30499881505966187, 'learning_rate': 8.000000000000001e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.1, 'epoch': 0.02}
  2%|▉                                             | 5/232 [12:06<7:37:27, 120.91s/it]  3%|█▏                                            | 6/232 [13:49<7:12:55, 114.94s/it]                                                                                      {'loss': 2.5716, 'grad_norm': 0.3081396818161011, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 615.36, 'epoch': 0.03}
  3%|█▏                                            | 6/232 [13:49<7:12:55, 114.94s/it]  3%|█▍                                            | 7/232 [15:34<6:58:49, 111.69s/it]                                                                                      {'loss': 2.5446, 'grad_norm': 0.3237297236919403, 'learning_rate': 9.99952117026961e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.2, 'epoch': 0.03}
  3%|█▍                                            | 7/232 [15:34<6:58:49, 111.69s/it]  3%|█▌                                            | 8/232 [17:19<6:49:00, 109.56s/it]                                                                                      {'loss': 2.5747, 'grad_norm': 0.43742334842681885, 'learning_rate': 9.998084772789603e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.04, 'epoch': 0.03}
  3%|█▌                                            | 8/232 [17:19<6:49:00, 109.56s/it]  4%|█▊                                            | 9/232 [19:03<6:40:56, 107.88s/it]                                                                                      {'loss': 2.6133, 'grad_norm': 0.3281920254230499, 'learning_rate': 9.995691082675908e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 502.32, 'epoch': 0.04}
  4%|█▊                                            | 9/232 [19:03<6:40:56, 107.88s/it]  4%|█▉                                           | 10/232 [20:49<6:36:36, 107.19s/it]                                                                                      {'loss': 2.4171, 'grad_norm': 0.3485400676727295, 'learning_rate': 9.99234055839652e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 557.48, 'epoch': 0.04}
  4%|█▉                                           | 10/232 [20:49<6:36:36, 107.19s/it]  5%|██▏                                          | 11/232 [22:33<6:31:21, 106.25s/it]                                                                                      {'loss': 2.5725, 'grad_norm': 0.337157666683197, 'learning_rate': 9.988033841683694e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 588.5, 'epoch': 0.05}
  5%|██▏                                          | 11/232 [22:33<6:31:21, 106.25s/it]  5%|██▎                                          | 12/232 [24:18<6:27:53, 105.79s/it]                                                                                      {'loss': 2.563, 'grad_norm': 0.3561720848083496, 'learning_rate': 9.982771757411032e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.06, 'epoch': 0.05}
  5%|██▎                                          | 12/232 [24:18<6:27:53, 105.79s/it]  6%|██▌                                          | 13/232 [26:02<6:24:17, 105.28s/it]                                                                                      {'loss': 2.4225, 'grad_norm': 0.3262440264225006, 'learning_rate': 9.97655531343549e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 591.93, 'epoch': 0.06}
  6%|██▌                                          | 13/232 [26:02<6:24:17, 105.28s/it]  6%|██▋                                          | 14/232 [27:47<6:22:07, 105.17s/it]                                                                                      {'loss': 2.4692, 'grad_norm': 0.3001886308193207, 'learning_rate': 9.969385700404346e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.72, 'epoch': 0.06}
  6%|██▋                                          | 14/232 [27:47<6:22:07, 105.17s/it]  6%|██▉                                          | 15/232 [29:32<6:20:09, 105.11s/it]                                                                                      {'loss': 2.4818, 'grad_norm': 0.26973697543144226, 'learning_rate': 9.96126429152715e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.28, 'epoch': 0.06}
  6%|██▉                                          | 15/232 [29:32<6:20:09, 105.11s/it]  7%|███                                          | 16/232 [31:16<6:17:40, 104.91s/it]                                                                                      {'loss': 2.7253, 'grad_norm': 0.279231458902359, 'learning_rate': 9.952192642312713e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.53, 'epoch': 0.07}
  7%|███                                          | 16/232 [31:16<6:17:40, 104.91s/it]  7%|███▎                                         | 17/232 [33:01<6:15:58, 104.92s/it]                                                                                      {'loss': 2.5233, 'grad_norm': 0.2751440405845642, 'learning_rate': 9.942172490271169e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.47, 'epoch': 0.07}
  7%|███▎                                         | 17/232 [33:01<6:15:58, 104.92s/it]  8%|███▍                                         | 18/232 [34:46<6:13:59, 104.86s/it]                                                                                      {'loss': 2.3993, 'grad_norm': 0.2261095941066742, 'learning_rate': 9.931205754581203e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 590.3, 'epoch': 0.08}
  8%|███▍                                         | 18/232 [34:46<6:13:59, 104.86s/it]  8%|███▋                                         | 19/232 [36:30<6:11:38, 104.69s/it]                                                                                      {'loss': 2.451, 'grad_norm': 0.2214576005935669, 'learning_rate': 9.919294535722452e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.38, 'epoch': 0.08}
  8%|███▋                                         | 19/232 [36:30<6:11:38, 104.69s/it]  9%|███▉                                         | 20/232 [38:15<6:10:28, 104.85s/it]                                                                                      {'loss': 2.4476, 'grad_norm': 0.22921393811702728, 'learning_rate': 9.9064411150732e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.55, 'epoch': 0.09}
  9%|███▉                                         | 20/232 [38:15<6:10:28, 104.85s/it]  9%|████                                         | 21/232 [40:00<6:08:21, 104.74s/it]                                                                                      {'loss': 2.5764, 'grad_norm': 0.2488836944103241, 'learning_rate': 9.892647954473425e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.2, 'epoch': 0.09}
  9%|████                                         | 21/232 [40:00<6:08:21, 104.74s/it]  9%|████▎                                        | 22/232 [41:44<6:06:26, 104.70s/it]                                                                                      {'loss': 2.4462, 'grad_norm': 0.21848219633102417, 'learning_rate': 9.877917695753275e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 547.83, 'epoch': 0.09}
  9%|████▎                                        | 22/232 [41:44<6:06:26, 104.70s/it] 10%|████▍                                        | 23/232 [43:29<6:04:18, 104.59s/it]                                                                                      {'loss': 2.4739, 'grad_norm': 0.21689197421073914, 'learning_rate': 9.862253160227077e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.87, 'epoch': 0.1}
 10%|████▍                                        | 23/232 [43:29<6:04:18, 104.59s/it] 10%|████▋                                        | 24/232 [45:14<6:03:49, 104.95s/it]                                                                                      {'loss': 2.5701, 'grad_norm': 0.24574564397335052, 'learning_rate': 9.845657348152958e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.54, 'epoch': 0.1}
 10%|████▋                                        | 24/232 [45:14<6:03:49, 104.95s/it][2025-10-07 12:36:33,154] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 12:36:35,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0339405536651611
[2025-10-07 12:36:36,484] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0328030586242676
[2025-10-07 12:36:37,528] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0433502197265625
[2025-10-07 12:36:38,568] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.039651870727539
[2025-10-07 12:36:38,569] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.75s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.65s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.22s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.55s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.74s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:57,  7.86s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.01s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.05s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.07s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:29<01:28,  8.09s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.17s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.16s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.15s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.89s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:47,  7.99s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.10s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:42<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.17s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.18s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.468449831008911, 'eval_runtime': 187.9758, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.1}
 10%|████▋                                        | 24/232 [48:28<6:03:49, 104.95s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.18s/it][A
                                                                                      [A[2025-10-07 12:39:46,575] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 12:39:57,444] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-24
 11%|████▊                                        | 25/232 [50:39<9:48:59, 170.72s/it]                                                                                      {'loss': 2.5522, 'grad_norm': 0.2543272078037262, 'learning_rate': 9.828133438158206e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 588.57, 'epoch': 0.11}
 11%|████▊                                        | 25/232 [50:39<9:48:59, 170.72s/it] 11%|█████                                        | 26/232 [52:24<8:38:51, 151.12s/it]                                                                                      {'loss': 2.3874, 'grad_norm': 0.2061944603919983, 'learning_rate': 9.809684786630462e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 575.63, 'epoch': 0.11}
 11%|█████                                        | 26/232 [52:24<8:38:51, 151.12s/it] 12%|█████▏                                       | 27/232 [54:09<7:49:00, 137.27s/it]                                                                                      {'loss': 2.4434, 'grad_norm': 0.21916723251342773, 'learning_rate': 9.79031492707486e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.46, 'epoch': 0.12}
 12%|█████▏                                       | 27/232 [54:09<7:49:00, 137.27s/it] 12%|█████▍                                       | 28/232 [55:53<7:13:08, 127.40s/it]                                                                                      {'loss': 2.4526, 'grad_norm': 0.21004539728164673, 'learning_rate': 9.770027569437252e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.07, 'epoch': 0.12}
 12%|█████▍                                       | 28/232 [55:53<7:13:08, 127.40s/it] 12%|█████▋                                       | 29/232 [57:38<6:48:02, 120.61s/it]                                                                                      {'loss': 2.462, 'grad_norm': 0.21658752858638763, 'learning_rate': 9.748826599393632e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 541.42, 'epoch': 0.12}
 12%|█████▋                                       | 29/232 [57:38<6:48:02, 120.61s/it] 13%|█████▊                                       | 30/232 [59:24<6:31:02, 116.15s/it]                                                                                      {'loss': 2.3799, 'grad_norm': 0.20760858058929443, 'learning_rate': 9.72671607760591e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 601.63, 'epoch': 0.13}
 13%|█████▊                                       | 30/232 [59:24<6:31:02, 116.15s/it] 13%|█████▋                                     | 31/232 [1:01:08<6:17:07, 112.57s/it]                                                                                      {'loss': 2.384, 'grad_norm': 0.19674667716026306, 'learning_rate': 9.703700238944157e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 533.46, 'epoch': 0.13}
 13%|█████▋                                     | 31/232 [1:01:08<6:17:07, 112.57s/it] 14%|█████▉                                     | 32/232 [1:02:53<6:07:29, 110.25s/it]                                                                                      {'loss': 2.5231, 'grad_norm': 0.17808720469474792, 'learning_rate': 9.679783491675507e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 609.74, 'epoch': 0.14}
 14%|█████▉                                     | 32/232 [1:02:53<6:07:29, 110.25s/it] 14%|██████                                     | 33/232 [1:04:37<5:59:49, 108.49s/it]                                                                                      {'loss': 2.4602, 'grad_norm': 0.20862546563148499, 'learning_rate': 9.654970416619814e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.45, 'epoch': 0.14}
 14%|██████                                     | 33/232 [1:04:37<5:59:49, 108.49s/it] 15%|██████▎                                    | 34/232 [1:06:22<5:54:24, 107.40s/it]                                                                                      {'loss': 2.5308, 'grad_norm': 0.19285555183887482, 'learning_rate': 9.629265766272293e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 589.77, 'epoch': 0.15}
 15%|██████▎                                    | 34/232 [1:06:22<5:54:24, 107.40s/it] 15%|██████▍                                    | 35/232 [1:08:07<5:50:22, 106.72s/it]                                                                                      {'loss': 2.4742, 'grad_norm': 0.2334047555923462, 'learning_rate': 9.602674463893266e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.45, 'epoch': 0.15}
 15%|██████▍                                    | 35/232 [1:08:07<5:50:22, 106.72s/it] 16%|██████▋                                    | 36/232 [1:09:52<5:46:15, 106.00s/it]                                                                                      {'loss': 2.6276, 'grad_norm': 0.2094365656375885, 'learning_rate': 9.575201602565192e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.7, 'epoch': 0.15}
 16%|██████▋                                    | 36/232 [1:09:52<5:46:15, 106.00s/it] 16%|██████▊                                    | 37/232 [1:11:36<5:43:15, 105.62s/it]                                                                                      {'loss': 2.3691, 'grad_norm': 0.1700724959373474, 'learning_rate': 9.54685244421718e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 573.17, 'epoch': 0.16}
 16%|██████▊                                    | 37/232 [1:11:36<5:43:15, 105.62s/it] 16%|███████                                    | 38/232 [1:13:21<5:40:47, 105.40s/it]                                                                                      {'loss': 2.5592, 'grad_norm': 0.3765801787376404, 'learning_rate': 9.517632418617173e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.1, 'epoch': 0.16}
 16%|███████                                    | 38/232 [1:13:21<5:40:47, 105.40s/it] 17%|███████▏                                   | 39/232 [1:15:05<5:37:51, 105.04s/it]                                                                                      {'loss': 2.3773, 'grad_norm': 0.2009992152452469, 'learning_rate': 9.487547122331965e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 566.78, 'epoch': 0.17}
 17%|███████▏                                   | 39/232 [1:15:05<5:37:51, 105.04s/it] 17%|███████▍                                   | 40/232 [1:16:51<5:36:50, 105.26s/it]                                                                                      {'loss': 2.4833, 'grad_norm': 0.22405098378658295, 'learning_rate': 9.456602317655274e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 605.79, 'epoch': 0.17}
 17%|███████▍                                   | 40/232 [1:16:51<5:36:50, 105.26s/it] 18%|███████▌                                   | 41/232 [1:18:36<5:34:21, 105.03s/it]                                                                                      {'loss': 2.4914, 'grad_norm': 0.18948987126350403, 'learning_rate': 9.424803931504095e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.18, 'epoch': 0.18}
 18%|███████▌                                   | 41/232 [1:18:36<5:34:21, 105.03s/it] 18%|███████▊                                   | 42/232 [1:20:21<5:32:32, 105.02s/it]                                                                                      {'loss': 2.4371, 'grad_norm': 0.16337323188781738, 'learning_rate': 9.392158054283497e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.47, 'epoch': 0.18}
 18%|███████▊                                   | 42/232 [1:20:21<5:32:32, 105.02s/it] 19%|███████▉                                   | 43/232 [1:22:05<5:30:04, 104.78s/it]                                                                                      {'loss': 2.4772, 'grad_norm': 0.17447726428508759, 'learning_rate': 9.358670938720114e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.39, 'epoch': 0.18}
 19%|███████▉                                   | 43/232 [1:22:05<5:30:04, 104.78s/it] 19%|████████▏                                  | 44/232 [1:23:49<5:27:50, 104.63s/it]                                                                                      {'loss': 2.4562, 'grad_norm': 0.1712619960308075, 'learning_rate': 9.32434899866455e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 557.26, 'epoch': 0.19}
 19%|████████▏                                  | 44/232 [1:23:49<5:27:50, 104.63s/it] 19%|████████▎                                  | 45/232 [1:25:34<5:26:24, 104.73s/it]                                                                                      {'loss': 2.4868, 'grad_norm': 0.1861104965209961, 'learning_rate': 9.289198807862929e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.39, 'epoch': 0.19}
 19%|████████▎                                  | 45/232 [1:25:34<5:26:24, 104.73s/it] 20%|████████▌                                  | 46/232 [1:27:19<5:24:25, 104.65s/it]                                                                                      {'loss': 2.6399, 'grad_norm': 0.19723129272460938, 'learning_rate': 9.253227098697804e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.28, 'epoch': 0.2}
 20%|████████▌                                  | 46/232 [1:27:19<5:24:25, 104.65s/it] 20%|████████▋                                  | 47/232 [1:29:04<5:23:09, 104.81s/it]                                                                                      {'loss': 2.5103, 'grad_norm': 0.18864890933036804, 'learning_rate': 9.216440760898695e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.2, 'epoch': 0.2}
 20%|████████▋                                  | 47/232 [1:29:04<5:23:09, 104.81s/it] 21%|████████▉                                  | 48/232 [1:30:50<5:22:25, 105.14s/it]                                                                                      {'loss': 2.4005, 'grad_norm': 0.1855887770652771, 'learning_rate': 9.178846840222489e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.83, 'epoch': 0.21}
 21%|████████▉                                  | 48/232 [1:30:50<5:22:25, 105.14s/it][2025-10-07 13:22:08,383] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 13:22:10,654] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0294299125671387
[2025-10-07 13:22:11,684] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0294687747955322
[2025-10-07 13:22:12,715] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0308668613433838
[2025-10-07 13:22:13,749] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.033970594406128
[2025-10-07 13:22:13,751] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.75s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.66s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.22s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.55s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.74s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:57,  7.86s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.01s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.05s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:29<01:29,  8.10s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.17s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.16s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.89s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:47,  8.00s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.10s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:42<00:16,  8.16s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.17s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.440826654434204, 'eval_runtime': 188.032, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.21}
 21%|████████▉                                  | 48/232 [1:34:03<5:22:25, 105.14s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A
                                                                                      [A[2025-10-07 13:25:21,811] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 13:25:32,526] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-48
 21%|█████████                                  | 49/232 [1:36:14<8:41:09, 170.87s/it]                                                                                      {'loss': 2.3372, 'grad_norm': 0.15217913687229156, 'learning_rate': 9.140452537103943e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 628.86, 'epoch': 0.21}
 21%|█████████                                  | 49/232 [1:36:14<8:41:09, 170.87s/it] 22%|█████████▎                                 | 50/232 [1:38:00<7:39:21, 151.44s/it]                                                                                      {'loss': 2.4017, 'grad_norm': 0.15943582355976105, 'learning_rate': 9.101265205276581e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 563.54, 'epoch': 0.22}
 22%|█████████▎                                 | 50/232 [1:38:00<7:39:21, 151.44s/it] 22%|█████████▍                                 | 51/232 [1:39:45<6:54:29, 137.40s/it]                                                                                      {'loss': 2.3504, 'grad_norm': 0.21394406259059906, 'learning_rate': 9.061292350364222e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.33, 'epoch': 0.22}
 22%|█████████▍                                 | 51/232 [1:39:45<6:54:29, 137.40s/it] 22%|█████████▋                                 | 52/232 [1:41:30<6:23:05, 127.69s/it]                                                                                      {'loss': 2.3518, 'grad_norm': 0.16352114081382751, 'learning_rate': 9.020541628443395e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.89, 'epoch': 0.22}
 22%|█████████▋                                 | 52/232 [1:41:30<6:23:05, 127.69s/it] 23%|█████████▊                                 | 53/232 [1:43:14<5:59:58, 120.66s/it]                                                                                      {'loss': 2.4803, 'grad_norm': 0.24781003594398499, 'learning_rate': 8.979020844576982e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 573.01, 'epoch': 0.23}
 23%|█████████▊                                 | 53/232 [1:43:14<5:59:58, 120.66s/it] 23%|██████████                                 | 54/232 [1:44:59<5:43:55, 115.93s/it]                                                                                      {'loss': 2.2747, 'grad_norm': 0.1854228526353836, 'learning_rate': 8.936737951319276e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.86, 'epoch': 0.23}
 23%|██████████                                 | 54/232 [1:44:59<5:43:55, 115.93s/it] 24%|██████████▏                                | 55/232 [1:46:44<5:32:07, 112.59s/it]                                                                                      {'loss': 2.4225, 'grad_norm': 0.22357748448848724, 'learning_rate': 8.893701047192832e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.98, 'epoch': 0.24}
 24%|██████████▏                                | 55/232 [1:46:44<5:32:07, 112.59s/it] 24%|██████████▍                                | 56/232 [1:48:28<5:23:05, 110.14s/it]                                                                                      {'loss': 2.2967, 'grad_norm': 0.14629872143268585, 'learning_rate': 8.84991837513733e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.51, 'epoch': 0.24}
 24%|██████████▍                                | 56/232 [1:48:28<5:23:05, 110.14s/it] 25%|██████████▌                                | 57/232 [1:50:13<5:16:41, 108.58s/it]                                                                                      {'loss': 2.4355, 'grad_norm': 0.18156284093856812, 'learning_rate': 8.805398320930792e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.09, 'epoch': 0.25}
 25%|██████████▌                                | 57/232 [1:50:13<5:16:41, 108.58s/it] 25%|██████████▊                                | 58/232 [1:51:58<5:11:49, 107.53s/it]                                                                                      {'loss': 2.4022, 'grad_norm': 0.21640124917030334, 'learning_rate': 8.760149411583436e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.78, 'epoch': 0.25}
 25%|██████████▊                                | 58/232 [1:51:58<5:11:49, 107.53s/it] 25%|██████████▉                                | 59/232 [1:53:43<5:07:28, 106.64s/it]                                                                                      {'loss': 2.5751, 'grad_norm': 0.19623495638370514, 'learning_rate': 8.71418031370449e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.73, 'epoch': 0.25}
 25%|██████████▉                                | 59/232 [1:53:43<5:07:28, 106.64s/it] 26%|███████████                                | 60/232 [1:55:29<5:05:06, 106.43s/it]                                                                                      {'loss': 2.3643, 'grad_norm': 0.16625775396823883, 'learning_rate': 8.667499831842252e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.64, 'epoch': 0.26}
 26%|███████████                                | 60/232 [1:55:29<5:05:06, 106.43s/it] 26%|███████████▎                               | 61/232 [1:57:13<5:01:41, 105.86s/it]                                                                                      {'loss': 2.5068, 'grad_norm': 0.16208383440971375, 'learning_rate': 8.62011690679774e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.28, 'epoch': 0.26}
 26%|███████████▎                               | 61/232 [1:57:13<5:01:41, 105.86s/it] 27%|███████████▍                               | 62/232 [1:58:58<4:59:07, 105.57s/it]                                                                                      {'loss': 2.5253, 'grad_norm': 0.15697439014911652, 'learning_rate': 8.572040613912241e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 492.93, 'epoch': 0.27}
 27%|███████████▍                               | 62/232 [1:58:58<4:59:07, 105.57s/it] 27%|███████████▋                               | 63/232 [2:00:43<4:56:30, 105.27s/it]                                                                                      {'loss': 2.554, 'grad_norm': 0.2069503366947174, 'learning_rate': 8.5232801613291e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.83, 'epoch': 0.27}
 27%|███████████▋                               | 63/232 [2:00:43<4:56:30, 105.27s/it] 28%|███████████▊                               | 64/232 [2:02:27<4:54:27, 105.16s/it]                                                                                      {'loss': 2.4899, 'grad_norm': 0.1628636121749878, 'learning_rate': 8.473844888230065e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 565.15, 'epoch': 0.28}
 28%|███████████▊                               | 64/232 [2:02:27<4:54:27, 105.16s/it] 28%|████████████                               | 65/232 [2:04:12<4:52:23, 105.05s/it]                                                                                      {'loss': 2.4041, 'grad_norm': 0.16927234828472137, 'learning_rate': 8.42374426304653e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 585.84, 'epoch': 0.28}
 28%|████████████                               | 65/232 [2:04:12<4:52:23, 105.05s/it] 28%|████████████▏                              | 66/232 [2:05:55<4:48:48, 104.39s/it]                                                                                      {'loss': 2.5481, 'grad_norm': 0.16745679080486298, 'learning_rate': 8.372987881646036e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 630.04, 'epoch': 0.28}
 28%|████████████▏                              | 66/232 [2:05:55<4:48:48, 104.39s/it] 29%|████████████▍                              | 67/232 [2:07:40<4:47:28, 104.53s/it]                                                                                      {'loss': 2.3923, 'grad_norm': 0.15913288295269012, 'learning_rate': 8.32158546549435e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 605.21, 'epoch': 0.29}
 29%|████████████▍                              | 67/232 [2:07:40<4:47:28, 104.53s/it] 29%|████████████▌                              | 68/232 [2:09:25<4:45:52, 104.59s/it]                                                                                      {'loss': 2.5484, 'grad_norm': 0.16244389116764069, 'learning_rate': 8.269546859793499e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 475.22, 'epoch': 0.29}
 29%|████████████▌                              | 68/232 [2:09:25<4:45:52, 104.59s/it] 30%|████████████▊                              | 69/232 [2:11:09<4:43:56, 104.52s/it]                                                                                      {'loss': 2.3085, 'grad_norm': 0.14511074125766754, 'learning_rate': 8.216882031596098e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.67, 'epoch': 0.3}
 30%|████████████▊                              | 69/232 [2:11:09<4:43:56, 104.52s/it] 30%|████████████▉                              | 70/232 [2:12:55<4:43:37, 105.04s/it]                                                                                      {'loss': 2.5241, 'grad_norm': 0.16853027045726776, 'learning_rate': 8.163601067896344e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.74, 'epoch': 0.3}
 30%|████████████▉                              | 70/232 [2:12:55<4:43:37, 105.04s/it] 31%|█████████████▏                             | 71/232 [2:14:40<4:41:28, 104.90s/it]                                                                                      {'loss': 2.3546, 'grad_norm': 0.13500231504440308, 'learning_rate': 8.109714173698027e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.86, 'epoch': 0.31}
 31%|█████████████▏                             | 71/232 [2:14:40<4:41:28, 104.90s/it] 31%|█████████████▎                             | 72/232 [2:16:26<4:40:27, 105.17s/it]                                                                                      {'loss': 2.358, 'grad_norm': 0.1682259440422058, 'learning_rate': 8.055231670059958e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.33, 'epoch': 0.31}
 31%|█████████████▎                             | 72/232 [2:16:26<4:40:27, 105.17s/it][2025-10-07 14:07:44,423] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 14:07:46,706] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0272562503814697
[2025-10-07 14:07:47,758] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0513060092926025
[2025-10-07 14:07:48,796] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.038541555404663
[2025-10-07 14:07:49,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0161314010620117
[2025-10-07 14:07:49,814] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.07s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.76s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.67s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.23s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.75s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.87s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.05s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:29<01:29,  8.09s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.17s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.16s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.89s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.00s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:42<00:16,  8.18s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.4302256107330322, 'eval_runtime': 188.137, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.31}
 31%|█████████████▎                             | 72/232 [2:19:39<4:40:27, 105.17s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A
                                                                                      [A[2025-10-07 14:10:57,960] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 14:11:08,752] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-72
 31%|█████████████▌                             | 73/232 [2:21:50<7:32:42, 170.84s/it]                                                                                      {'loss': 2.5516, 'grad_norm': 0.37493956089019775, 'learning_rate': 8.000163992119146e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 539.21, 'epoch': 0.31}
 31%|█████████████▌                             | 73/232 [2:21:50<7:32:42, 170.84s/it] 32%|█████████████▋                             | 74/232 [2:23:35<6:38:13, 151.22s/it]                                                                                      {'loss': 2.3886, 'grad_norm': 0.15932448208332062, 'learning_rate': 7.944521687092143e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 627.95, 'epoch': 0.32}
 32%|█████████████▋                             | 74/232 [2:23:35<6:38:13, 151.22s/it] 32%|█████████████▉                             | 75/232 [2:25:20<5:59:32, 137.40s/it]                                                                                      {'loss': 2.5004, 'grad_norm': 0.1597600132226944, 'learning_rate': 7.888315412254921e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.22, 'epoch': 0.32}
 32%|█████████████▉                             | 75/232 [2:25:20<5:59:32, 137.40s/it] 33%|██████████████                             | 76/232 [2:27:02<5:29:12, 126.62s/it]                                                                                      {'loss': 2.4302, 'grad_norm': 0.15421240031719208, 'learning_rate': 7.831555932901642e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.04, 'epoch': 0.33}
 33%|██████████████                             | 76/232 [2:27:02<5:29:12, 126.62s/it] 33%|██████████████▎                            | 77/232 [2:28:47<5:10:30, 120.20s/it]                                                                                      {'loss': 2.7413, 'grad_norm': 0.18825848400592804, 'learning_rate': 7.774254120282792e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.97, 'epoch': 0.33}
 33%|██████████████▎                            | 77/232 [2:28:47<5:10:30, 120.20s/it] 34%|██████████████▍                            | 78/232 [2:30:32<4:56:50, 115.65s/it]                                                                                      {'loss': 2.4924, 'grad_norm': 0.18012621998786926, 'learning_rate': 7.71642094952296e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 537.21, 'epoch': 0.34}
 34%|██████████████▍                            | 78/232 [2:30:32<4:56:50, 115.65s/it] 34%|██████████████▋                            | 79/232 [2:32:16<4:46:13, 112.24s/it]                                                                                      {'loss': 2.3216, 'grad_norm': 0.15518777072429657, 'learning_rate': 7.658067497518773e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 442.25, 'epoch': 0.34}
 34%|██████████████▋                            | 79/232 [2:32:16<4:46:13, 112.24s/it] 34%|██████████████▊                            | 80/232 [2:34:02<4:39:28, 110.32s/it]                                                                                      {'loss': 2.4516, 'grad_norm': 0.1714549958705902, 'learning_rate': 7.599204940817309e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 482.12, 'epoch': 0.34}
 34%|██████████████▊                            | 80/232 [2:34:02<4:39:28, 110.32s/it] 35%|███████████████                            | 81/232 [2:35:47<4:33:26, 108.65s/it]                                                                                      {'loss': 2.4457, 'grad_norm': 0.1730002611875534, 'learning_rate': 7.539844553475427e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.62, 'epoch': 0.35}
 35%|███████████████                            | 81/232 [2:35:47<4:33:26, 108.65s/it] 35%|███████████████▏                           | 82/232 [2:37:32<4:28:49, 107.53s/it]                                                                                      {'loss': 2.3308, 'grad_norm': 0.14245469868183136, 'learning_rate': 7.479997704900437e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 556.71, 'epoch': 0.35}
 35%|███████████████▏                           | 82/232 [2:37:32<4:28:49, 107.53s/it] 36%|███████████████▍                           | 83/232 [2:39:16<4:24:40, 106.58s/it]                                                                                      {'loss': 2.3798, 'grad_norm': 0.15552735328674316, 'learning_rate': 7.4196758576724835e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 569.3, 'epoch': 0.36}
 36%|███████████████▍                           | 83/232 [2:39:16<4:24:40, 106.58s/it] 36%|███████████████▌                           | 84/232 [2:41:01<4:21:54, 106.18s/it]                                                                                      {'loss': 2.5172, 'grad_norm': 0.1897697001695633, 'learning_rate': 7.358890565349106e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.77, 'epoch': 0.36}
 36%|███████████████▌                           | 84/232 [2:41:01<4:21:54, 106.18s/it] 37%|███████████████▊                           | 85/232 [2:42:47<4:19:20, 105.85s/it]                                                                                      {'loss': 2.4283, 'grad_norm': 0.17907385528087616, 'learning_rate': 7.297653470252359e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.1, 'epoch': 0.37}
 37%|███████████████▊                           | 85/232 [2:42:47<4:19:20, 105.85s/it] 37%|███████████████▉                           | 86/232 [2:44:31<4:16:38, 105.47s/it]                                                                                      {'loss': 2.44, 'grad_norm': 0.16645929217338562, 'learning_rate': 7.235976301238933e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.75, 'epoch': 0.37}
 37%|███████████████▉                           | 86/232 [2:44:31<4:16:38, 105.47s/it] 38%|████████████████▏                          | 87/232 [2:46:16<4:14:33, 105.33s/it]                                                                                      {'loss': 2.3995, 'grad_norm': 0.18987996876239777, 'learning_rate': 7.1738708714537165e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.08, 'epoch': 0.37}
 38%|████████████████▏                          | 87/232 [2:46:16<4:14:33, 105.33s/it] 38%|████████████████▎                          | 88/232 [2:48:01<4:12:24, 105.17s/it]                                                                                      {'loss': 2.4353, 'grad_norm': 0.3808911442756653, 'learning_rate': 7.111349076067186e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 521.64, 'epoch': 0.38}
 38%|████████████████▎                          | 88/232 [2:48:01<4:12:24, 105.17s/it] 38%|████████████████▍                          | 89/232 [2:49:45<4:10:07, 104.95s/it]                                                                                      {'loss': 2.5919, 'grad_norm': 0.1509799361228943, 'learning_rate': 7.048422889997115e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 545.32, 'epoch': 0.38}
 38%|████████████████▍                          | 89/232 [2:49:45<4:10:07, 104.95s/it] 39%|████████████████▋                          | 90/232 [2:51:31<4:08:59, 105.21s/it]                                                                                      {'loss': 2.483, 'grad_norm': 0.15931715071201324, 'learning_rate': 6.985104365614987e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 606.25, 'epoch': 0.39}
 39%|████████████████▋                          | 90/232 [2:51:31<4:08:59, 105.21s/it] 39%|████████████████▊                          | 91/232 [2:53:16<4:06:52, 105.05s/it]                                                                                      {'loss': 2.4388, 'grad_norm': 0.1680772304534912, 'learning_rate': 6.921405630437585e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.99, 'epoch': 0.39}
 39%|████████████████▊                          | 91/232 [2:53:16<4:06:52, 105.05s/it] 40%|█████████████████                          | 92/232 [2:55:01<4:05:06, 105.04s/it]                                                                                      {'loss': 2.4395, 'grad_norm': 0.1529005765914917, 'learning_rate': 6.857338884804185e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 528.13, 'epoch': 0.4}
 40%|█████████████████                          | 92/232 [2:55:01<4:05:06, 105.04s/it] 40%|█████████████████▏                         | 93/232 [2:56:42<4:00:21, 103.75s/it]                                                                                      {'loss': 2.3016, 'grad_norm': 0.27351638674736023, 'learning_rate': 6.792916399539805e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 477.83, 'epoch': 0.4}
 40%|█████████████████▏                         | 93/232 [2:56:42<4:00:21, 103.75s/it] 41%|█████████████████▍                         | 94/232 [2:58:27<3:59:37, 104.18s/it]                                                                                      {'loss': 2.3479, 'grad_norm': 0.1746247112751007, 'learning_rate': 6.728150513604942e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 551.33, 'epoch': 0.4}
 41%|█████████████████▍                         | 94/232 [2:58:27<3:59:37, 104.18s/it] 41%|█████████████████▌                         | 95/232 [3:00:09<3:56:41, 103.66s/it]                                                                                      {'loss': 2.477, 'grad_norm': 0.16394348442554474, 'learning_rate': 6.663053631732279e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 631.79, 'epoch': 0.41}
 41%|█████████████████▌                         | 95/232 [3:00:09<3:56:41, 103.66s/it] 41%|█████████████████▊                         | 96/232 [3:01:54<3:55:53, 104.07s/it]                                                                                      {'loss': 2.2869, 'grad_norm': 0.1481611430644989, 'learning_rate': 6.597638222050773e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 390.53, 'epoch': 0.41}
 41%|█████████████████▊                         | 96/232 [3:01:54<3:55:53, 104.07s/it][2025-10-07 14:53:13,026] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 14:53:15,597] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1568572521209717
[2025-10-07 14:53:16,769] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1719346046447754
[2025-10-07 14:53:17,955] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.185570478439331
[2025-10-07 14:53:19,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1617960929870605
[2025-10-07 14:53:19,118] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.75s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.66s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.23s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.74s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.87s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.05s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:29<01:29,  8.10s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.17s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.16s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:47,  8.00s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:42<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.423957347869873, 'eval_runtime': 188.1087, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.41}
 41%|█████████████████▊                         | 96/232 [3:05:08<3:55:53, 104.07s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A
                                                                                      [A[2025-10-07 14:56:27,236] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 14:56:37,557] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-96
 42%|█████████████████▉                         | 97/232 [3:07:19<6:23:07, 170.28s/it]                                                                                      {'loss': 2.3912, 'grad_norm': 0.15088680386543274, 'learning_rate': 6.5319168136976155e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 615.67, 'epoch': 0.42}
 42%|█████████████████▉                         | 97/232 [3:07:19<6:23:07, 170.28s/it] 42%|██████████████████▏                        | 98/232 [3:09:04<5:36:40, 150.75s/it]                                                                                      {'loss': 2.3082, 'grad_norm': 0.15012118220329285, 'learning_rate': 6.465901994418505e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 574.1, 'epoch': 0.42}
 42%|██████████████████▏                        | 98/232 [3:09:04<5:36:40, 150.75s/it] 43%|██████████████████▎                        | 99/232 [3:10:49<5:03:43, 137.02s/it]                                                                                      {'loss': 2.6214, 'grad_norm': 0.17665165662765503, 'learning_rate': 6.399606408156688e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 603.33, 'epoch': 0.43}
 43%|██████████████████▎                        | 99/232 [3:10:49<5:03:43, 137.02s/it] 43%|██████████████████                        | 100/232 [3:12:35<4:40:31, 127.51s/it]                                                                                      {'loss': 2.3682, 'grad_norm': 0.1719701886177063, 'learning_rate': 6.333042752631243e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 583.49, 'epoch': 0.43}
 43%|██████████████████                        | 100/232 [3:12:35<4:40:31, 127.51s/it] 44%|██████████████████▎                       | 101/232 [3:14:19<4:23:28, 120.68s/it]                                                                                      {'loss': 2.5312, 'grad_norm': 0.15609215199947357, 'learning_rate': 6.266223776905062e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.84, 'epoch': 0.43}
 44%|██████████████████▎                       | 101/232 [3:14:19<4:23:28, 120.68s/it] 44%|██████████████████▍                       | 102/232 [3:16:04<4:11:21, 116.01s/it]                                                                                      {'loss': 2.3856, 'grad_norm': 0.18730811774730682, 'learning_rate': 6.199162278942997e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.95, 'epoch': 0.44}
 44%|██████████████████▍                       | 102/232 [3:16:04<4:11:21, 116.01s/it] 44%|██████████████████▋                       | 103/232 [3:17:49<4:02:00, 112.56s/it]                                                                                      {'loss': 2.563, 'grad_norm': 0.1721767634153366, 'learning_rate': 6.131871103160644e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.79, 'epoch': 0.44}
 44%|██████████████████▋                       | 103/232 [3:17:49<4:02:00, 112.56s/it] 45%|██████████████████▊                       | 104/232 [3:19:34<3:55:23, 110.34s/it]                                                                                      {'loss': 2.3592, 'grad_norm': 0.20805968344211578, 'learning_rate': 6.064363137964225e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.17, 'epoch': 0.45}
 45%|██████████████████▊                       | 104/232 [3:19:34<3:55:23, 110.34s/it] 45%|███████████████████                       | 105/232 [3:21:19<3:50:13, 108.77s/it]                                                                                      {'loss': 2.4238, 'grad_norm': 0.16840516030788422, 'learning_rate': 5.996651313282051e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.25, 'epoch': 0.45}
 45%|███████████████████                       | 105/232 [3:21:19<3:50:13, 108.77s/it] 46%|███████████████████▏                      | 106/232 [3:23:04<3:45:50, 107.55s/it]                                                                                      {'loss': 2.3565, 'grad_norm': 0.17043235898017883, 'learning_rate': 5.9287485980880245e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.98, 'epoch': 0.46}
 46%|███████████████████▏                      | 106/232 [3:23:04<3:45:50, 107.55s/it] 46%|███████████████████▎                      | 107/232 [3:24:49<3:42:28, 106.79s/it]                                                                                      {'loss': 2.2732, 'grad_norm': 0.15569466352462769, 'learning_rate': 5.860667997917668e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.04, 'epoch': 0.46}
 46%|███████████████████▎                      | 107/232 [3:24:49<3:42:28, 106.79s/it] 47%|███████████████████▌                      | 108/232 [3:26:34<3:39:40, 106.30s/it]                                                                                      {'loss': 2.4234, 'grad_norm': 0.1824769228696823, 'learning_rate': 5.792422552377153e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.93, 'epoch': 0.46}
 47%|███████████████████▌                      | 108/232 [3:26:34<3:39:40, 106.30s/it] 47%|███████████████████▋                      | 109/232 [3:28:18<3:36:43, 105.72s/it]                                                                                      {'loss': 2.4736, 'grad_norm': 0.17908412218093872, 'learning_rate': 5.724025332645794e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.4, 'epoch': 0.47}
 47%|███████████████████▋                      | 109/232 [3:28:18<3:36:43, 105.72s/it] 47%|███████████████████▉                      | 110/232 [3:29:59<3:32:02, 104.28s/it]                                                                                      {'loss': 2.3608, 'grad_norm': 0.14559771120548248, 'learning_rate': 5.655489438972503e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 584.43, 'epoch': 0.47}
 47%|███████████████████▉                      | 110/232 [3:29:59<3:32:02, 104.28s/it] 48%|████████████████████                      | 111/232 [3:31:44<3:30:26, 104.35s/it]                                                                                      {'loss': 2.3306, 'grad_norm': 0.14917968213558197, 'learning_rate': 5.586827998166678e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 552.58, 'epoch': 0.48}
 48%|████████████████████                      | 111/232 [3:31:44<3:30:26, 104.35s/it] 48%|████████████████████▎                     | 112/232 [3:33:29<3:29:00, 104.51s/it]                                                                                      {'loss': 2.432, 'grad_norm': 0.20721513032913208, 'learning_rate': 5.518054161083994e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 515.93, 'epoch': 0.48}
 48%|████████████████████▎                     | 112/232 [3:33:29<3:29:00, 104.51s/it] 49%|████████████████████▍                     | 113/232 [3:35:13<3:27:21, 104.55s/it]                                                                                      {'loss': 2.4911, 'grad_norm': 0.3023054897785187, 'learning_rate': 5.449181100107599e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.31, 'epoch': 0.49}
 49%|████████████████████▍                     | 113/232 [3:35:13<3:27:21, 104.55s/it] 49%|████████████████████▋                     | 114/232 [3:36:58<3:25:54, 104.70s/it]                                                                                      {'loss': 2.5054, 'grad_norm': 0.1722680777311325, 'learning_rate': 5.38022200662518e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 532.56, 'epoch': 0.49}
 49%|████████████████████▋                     | 114/232 [3:36:58<3:25:54, 104.70s/it] 50%|████████████████████▊                     | 115/232 [3:38:41<3:22:52, 104.03s/it]                                                                                      {'loss': 2.4839, 'grad_norm': 0.16150489449501038, 'learning_rate': 5.31119008850239e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 513.61, 'epoch': 0.49}
 50%|████████████████████▊                     | 115/232 [3:38:41<3:22:52, 104.03s/it] 50%|█████████████████████                     | 116/232 [3:40:25<3:21:20, 104.14s/it]                                                                                      {'loss': 2.3893, 'grad_norm': 0.22127696871757507, 'learning_rate': 5.242098567553133e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 588.46, 'epoch': 0.5}
 50%|█████████████████████                     | 116/232 [3:40:25<3:21:20, 104.14s/it] 50%|█████████████████████▏                    | 117/232 [3:42:10<3:20:05, 104.40s/it]                                                                                      {'loss': 2.2837, 'grad_norm': 0.15573346614837646, 'learning_rate': 5.1729606770071395e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.59, 'epoch': 0.5}
 50%|█████████████████████▏                    | 117/232 [3:42:10<3:20:05, 104.40s/it] 51%|█████████████████████▎                    | 118/232 [3:43:55<3:18:43, 104.59s/it]                                                                                      {'loss': 2.5217, 'grad_norm': 0.16544535756111145, 'learning_rate': 5.103789658975413e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.58, 'epoch': 0.51}
 51%|█████████████████████▎                    | 118/232 [3:43:55<3:18:43, 104.59s/it] 51%|█████████████████████▌                    | 119/232 [3:45:40<3:17:00, 104.60s/it]                                                                                      {'loss': 2.3913, 'grad_norm': 0.16304516792297363, 'learning_rate': 5.034598761913917e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.38, 'epoch': 0.51}
 51%|█████████████████████▌                    | 119/232 [3:45:40<3:17:00, 104.60s/it] 52%|█████████████████████▋                    | 120/232 [3:47:26<3:15:54, 104.95s/it]                                                                                      {'loss': 2.4939, 'grad_norm': 0.1516706794500351, 'learning_rate': 4.965401238086084e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 587.79, 'epoch': 0.52}
 52%|█████████████████████▋                    | 120/232 [3:47:26<3:15:54, 104.95s/it][2025-10-07 15:38:44,450] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 15:38:46,763] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0435214042663574
[2025-10-07 15:38:47,796] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.032888650894165
[2025-10-07 15:38:48,835] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0386509895324707
[2025-10-07 15:38:49,875] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0395262241363525
[2025-10-07 15:38:49,876] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.07s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.76s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.67s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.24s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.75s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.87s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.06s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:30<01:29,  8.10s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.17s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.16s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.00s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:43<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.4198384284973145, 'eval_runtime': 188.1898, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.52}
 52%|█████████████████████▋                    | 120/232 [3:50:39<3:15:54, 104.95s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A
                                                                                      [A[2025-10-07 15:41:58,075] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 15:42:08,817] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-120
 52%|█████████████████████▉                    | 121/232 [3:52:49<5:15:34, 170.58s/it]                                                                                      {'loss': 2.4623, 'grad_norm': 0.1619013249874115, 'learning_rate': 4.896210341024587e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 627.33, 'epoch': 0.52}
 52%|█████████████████████▉                    | 121/232 [3:52:49<5:15:34, 170.58s/it] 53%|██████████████████████                    | 122/232 [3:54:35<4:36:58, 151.08s/it]                                                                                      {'loss': 2.409, 'grad_norm': 0.18715393543243408, 'learning_rate': 4.827039322992861e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 584.4, 'epoch': 0.52}
 53%|██████████████████████                    | 122/232 [3:54:35<4:36:58, 151.08s/it] 53%|██████████████████████▎                   | 123/232 [3:56:20<4:09:29, 137.33s/it]                                                                                      {'loss': 2.3789, 'grad_norm': 0.16886143386363983, 'learning_rate': 4.75790143244687e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.91, 'epoch': 0.53}
 53%|██████████████████████▎                   | 123/232 [3:56:20<4:09:29, 137.33s/it] 53%|██████████████████████▍                   | 124/232 [3:58:02<3:47:52, 126.60s/it]                                                                                      {'loss': 2.5186, 'grad_norm': 0.1661851704120636, 'learning_rate': 4.68880991149761e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 630.15, 'epoch': 0.53}
 53%|██████████████████████▍                   | 124/232 [3:58:02<3:47:52, 126.60s/it] 54%|██████████████████████▋                   | 125/232 [3:59:47<3:34:17, 120.16s/it]                                                                                      {'loss': 2.4673, 'grad_norm': 0.29293861985206604, 'learning_rate': 4.6197779933748226e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.33, 'epoch': 0.54}
 54%|██████████████████████▋                   | 125/232 [3:59:47<3:34:17, 120.16s/it] 54%|██████████████████████▊                   | 126/232 [4:01:32<3:24:26, 115.72s/it]                                                                                      {'loss': 2.5636, 'grad_norm': 0.17951923608779907, 'learning_rate': 4.550818899892402e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.02, 'epoch': 0.54}
 54%|██████████████████████▊                   | 126/232 [4:01:32<3:24:26, 115.72s/it] 55%|██████████████████████▉                   | 127/232 [4:03:17<3:16:45, 112.43s/it]                                                                                      {'loss': 2.4493, 'grad_norm': 0.19470256567001343, 'learning_rate': 4.481945838916006e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.59, 'epoch': 0.55}
 55%|██████████████████████▉                   | 127/232 [4:03:17<3:16:45, 112.43s/it] 55%|███████████████████████▏                  | 128/232 [4:05:02<3:11:05, 110.25s/it]                                                                                      {'loss': 2.3728, 'grad_norm': 0.16387903690338135, 'learning_rate': 4.413172001833324e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.37, 'epoch': 0.55}
 55%|███████████████████████▏                  | 128/232 [4:05:02<3:11:05, 110.25s/it] 56%|███████████████████████▎                  | 129/232 [4:06:47<3:06:41, 108.75s/it]                                                                                      {'loss': 2.2864, 'grad_norm': 0.16799062490463257, 'learning_rate': 4.344510561027498e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.67, 'epoch': 0.55}
 56%|███████████████████████▎                  | 129/232 [4:06:47<3:06:41, 108.75s/it] 56%|███████████████████████▌                  | 130/232 [4:08:33<3:03:14, 107.79s/it]                                                                                      {'loss': 2.4157, 'grad_norm': 0.14640620350837708, 'learning_rate': 4.275974667354208e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 572.96, 'epoch': 0.56}
 56%|███████████████████████▌                  | 130/232 [4:08:33<3:03:14, 107.79s/it] 56%|███████████████████████▋                  | 131/232 [4:10:18<3:00:03, 106.96s/it]                                                                                      {'loss': 2.5456, 'grad_norm': 0.21564586460590363, 'learning_rate': 4.207577447622849e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.04, 'epoch': 0.56}
 56%|███████████████████████▋                  | 131/232 [4:10:18<3:00:03, 106.96s/it] 57%|███████████████████████▉                  | 132/232 [4:12:03<2:57:22, 106.42s/it]                                                                                      {'loss': 2.5254, 'grad_norm': 0.1728494018316269, 'learning_rate': 4.139332002082333e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 575.86, 'epoch': 0.57}
 57%|███████████████████████▉                  | 132/232 [4:12:03<2:57:22, 106.42s/it] 57%|████████████████████████                  | 133/232 [4:13:48<2:54:42, 105.88s/it]                                                                                      {'loss': 2.3662, 'grad_norm': 0.23733289539813995, 'learning_rate': 4.071251401911977e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.44, 'epoch': 0.57}
 57%|████████████████████████                  | 133/232 [4:13:48<2:54:42, 105.88s/it] 58%|████████████████████████▎                 | 134/232 [4:15:33<2:52:36, 105.68s/it]                                                                                      {'loss': 2.3741, 'grad_norm': 0.16287827491760254, 'learning_rate': 4.00334868671795e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 549.45, 'epoch': 0.58}
 58%|████████████████████████▎                 | 134/232 [4:15:33<2:52:36, 105.68s/it] 58%|████████████████████████▍                 | 135/232 [4:17:18<2:50:34, 105.51s/it]                                                                                      {'loss': 2.463, 'grad_norm': 0.19709622859954834, 'learning_rate': 3.935636862035776e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 539.3, 'epoch': 0.58}
 58%|████████████████████████▍                 | 135/232 [4:17:18<2:50:34, 105.51s/it] 59%|████████████████████████▌                 | 136/232 [4:19:03<2:48:16, 105.17s/it]                                                                                      {'loss': 2.3745, 'grad_norm': 0.1827632039785385, 'learning_rate': 3.868128896839357e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 597.28, 'epoch': 0.58}
 59%|████████████████████████▌                 | 136/232 [4:19:03<2:48:16, 105.17s/it] 59%|████████████████████████▊                 | 137/232 [4:20:48<2:46:31, 105.17s/it]                                                                                      {'loss': 2.4479, 'grad_norm': 0.19809529185295105, 'learning_rate': 3.8008377210570045e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 593.94, 'epoch': 0.59}
 59%|████████████████████████▊                 | 137/232 [4:20:48<2:46:31, 105.17s/it] 59%|████████████████████████▉                 | 138/232 [4:22:33<2:44:49, 105.20s/it]                                                                                      {'loss': 2.4662, 'grad_norm': 0.17152650654315948, 'learning_rate': 3.7337762230949397e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.54, 'epoch': 0.59}
 59%|████████████████████████▉                 | 138/232 [4:22:33<2:44:49, 105.20s/it] 60%|█████████████████████████▏                | 139/232 [4:24:18<2:42:50, 105.06s/it]                                                                                      {'loss': 2.3467, 'grad_norm': 0.16035908460617065, 'learning_rate': 3.6669572473687577e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.85, 'epoch': 0.6}
 60%|█████████████████████████▏                | 139/232 [4:24:18<2:42:50, 105.06s/it] 60%|█████████████████████████▎                | 140/232 [4:26:04<2:41:31, 105.34s/it]                                                                                      {'loss': 2.6583, 'grad_norm': 0.1990923434495926, 'learning_rate': 3.6003935918433124e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.49, 'epoch': 0.6}
 60%|█████████████████████████▎                | 140/232 [4:26:04<2:41:31, 105.34s/it] 61%|█████████████████████████▌                | 141/232 [4:27:48<2:39:28, 105.15s/it]                                                                                      {'loss': 2.4977, 'grad_norm': 0.17929832637310028, 'learning_rate': 3.534098005581497e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 615.64, 'epoch': 0.61}
 61%|█████████████████████████▌                | 141/232 [4:27:48<2:39:28, 105.15s/it] 61%|█████████████████████████▋                | 142/232 [4:29:33<2:37:40, 105.12s/it]                                                                                      {'loss': 2.604, 'grad_norm': 0.16450382769107819, 'learning_rate': 3.4680831863023866e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.82, 'epoch': 0.61}
 61%|█████████████████████████▋                | 142/232 [4:29:33<2:37:40, 105.12s/it] 62%|█████████████████████████▉                | 143/232 [4:31:18<2:35:44, 104.99s/it]                                                                                      {'loss': 2.3583, 'grad_norm': 0.15677917003631592, 'learning_rate': 3.402361777949229e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 598.1, 'epoch': 0.62}
 62%|█████████████████████████▉                | 143/232 [4:31:18<2:35:44, 104.99s/it] 62%|██████████████████████████                | 144/232 [4:33:04<2:34:22, 105.26s/it]                                                                                      {'loss': 2.6741, 'grad_norm': 0.26103436946868896, 'learning_rate': 3.336946368267724e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 505.75, 'epoch': 0.62}
 62%|██████████████████████████                | 144/232 [4:33:04<2:34:22, 105.26s/it][2025-10-07 16:24:22,771] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 16:24:25,081] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0497405529022217
[2025-10-07 16:24:26,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.035264253616333
[2025-10-07 16:24:27,150] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0325191020965576
[2025-10-07 16:24:28,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0278091430664062
[2025-10-07 16:24:28,179] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.76s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.66s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.23s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.75s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.87s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.05s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:36,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:30<01:29,  8.10s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.18s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.17s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.00s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.10s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:43<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.417459726333618, 'eval_runtime': 188.1589, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.62}
 62%|██████████████████████████                | 144/232 [4:36:18<2:34:22, 105.26s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.19s/it][A
                                                                                      [A[2025-10-07 16:27:36,365] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 16:27:47,153] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-144
 62%|██████████████████████████▎               | 145/232 [4:38:29<4:08:02, 171.07s/it]                                                                                      {'loss': 2.4198, 'grad_norm': 0.15687525272369385, 'learning_rate': 3.271849486395059e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 544.63, 'epoch': 0.62}
 62%|██████████████████████████▎               | 145/232 [4:38:29<4:08:02, 171.07s/it] 63%|██████████████████████████▍               | 146/232 [4:40:14<3:36:59, 151.39s/it]                                                                                      {'loss': 2.4322, 'grad_norm': 0.14793400466442108, 'learning_rate': 3.207083600460196e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 582.54, 'epoch': 0.63}
 63%|██████████████████████████▍               | 146/232 [4:40:14<3:36:59, 151.39s/it] 63%|██████████████████████████▌               | 147/232 [4:41:59<3:14:53, 137.57s/it]                                                                                      {'loss': 2.4873, 'grad_norm': 0.1773616373538971, 'learning_rate': 3.1426611151958146e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.19, 'epoch': 0.63}
 63%|██████████████████████████▌               | 147/232 [4:41:59<3:14:53, 137.57s/it] 64%|██████████████████████████▊               | 148/232 [4:43:44<2:58:45, 127.69s/it]                                                                                      {'loss': 2.4418, 'grad_norm': 0.1769264191389084, 'learning_rate': 3.078594369562417e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.38, 'epoch': 0.64}
 64%|██████████████████████████▊               | 148/232 [4:43:44<2:58:45, 127.69s/it] 64%|██████████████████████████▉               | 149/232 [4:45:29<2:47:15, 120.91s/it]                                                                                      {'loss': 2.4109, 'grad_norm': 0.1511746048927307, 'learning_rate': 3.0148956343850143e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 585.43, 'epoch': 0.64}
 64%|██████████████████████████▉               | 149/232 [4:45:29<2:47:15, 120.91s/it] 65%|███████████████████████████▏              | 150/232 [4:47:15<2:39:12, 116.50s/it]                                                                                      {'loss': 2.4526, 'grad_norm': 0.1719832420349121, 'learning_rate': 2.9515771100028854e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 617.13, 'epoch': 0.65}
 65%|███████████████████████████▏              | 150/232 [4:47:15<2:39:12, 116.50s/it] 65%|███████████████████████████▎              | 151/232 [4:49:00<2:32:29, 112.95s/it]                                                                                      {'loss': 2.4441, 'grad_norm': 0.1828099489212036, 'learning_rate': 2.888650923932815e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.57, 'epoch': 0.65}
 65%|███████████████████████████▎              | 151/232 [4:49:00<2:32:29, 112.95s/it] 66%|███████████████████████████▌              | 152/232 [4:50:45<2:27:29, 110.62s/it]                                                                                      {'loss': 2.277, 'grad_norm': 0.15714910626411438, 'learning_rate': 2.8261291285462843e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.21, 'epoch': 0.65}
 66%|███████████████████████████▌              | 152/232 [4:50:45<2:27:29, 110.62s/it] 66%|███████████████████████████▋              | 153/232 [4:52:27<2:21:59, 107.84s/it]                                                                                      {'loss': 2.4279, 'grad_norm': 0.15972191095352173, 'learning_rate': 2.7640236987610662e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 510.77, 'epoch': 0.66}
 66%|███████████████████████████▋              | 153/232 [4:52:27<2:21:59, 107.84s/it] 66%|███████████████████████████▉              | 154/232 [4:54:10<2:18:34, 106.60s/it]                                                                                      {'loss': 2.4685, 'grad_norm': 0.16657793521881104, 'learning_rate': 2.7023465297476426e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.24, 'epoch': 0.66}
 66%|███████████████████████████▉              | 154/232 [4:54:10<2:18:34, 106.60s/it] 67%|████████████████████████████              | 155/232 [4:55:56<2:16:17, 106.21s/it]                                                                                      {'loss': 2.4661, 'grad_norm': 0.19078649580478668, 'learning_rate': 2.641109434650894e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.4, 'epoch': 0.67}
 67%|████████████████████████████              | 155/232 [4:55:56<2:16:17, 106.21s/it] 67%|████████████████████████████▏             | 156/232 [4:57:39<2:13:34, 105.46s/it]                                                                                      {'loss': 2.4035, 'grad_norm': 0.20265750586986542, 'learning_rate': 2.580324142327516e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 535.76, 'epoch': 0.67}
 67%|████████████████████████████▏             | 156/232 [4:57:39<2:13:34, 105.46s/it] 68%|████████████████████████████▍             | 157/232 [4:59:23<2:11:14, 105.00s/it]                                                                                      {'loss': 2.3609, 'grad_norm': 0.18236300349235535, 'learning_rate': 2.520002295099564e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 629.39, 'epoch': 0.68}
 68%|████████████████████████████▍             | 157/232 [4:59:23<2:11:14, 105.00s/it] 68%|████████████████████████████▌             | 158/232 [5:01:09<2:09:40, 105.14s/it]                                                                                      {'loss': 2.6265, 'grad_norm': 0.18648919463157654, 'learning_rate': 2.460155446524573e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.42, 'epoch': 0.68}
 68%|████████████████████████████▌             | 158/232 [5:01:09<2:09:40, 105.14s/it] 69%|████████████████████████████▊             | 159/232 [5:02:53<2:07:48, 105.05s/it]                                                                                      {'loss': 2.5449, 'grad_norm': 0.8310457468032837, 'learning_rate': 2.400795059182692e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.21, 'epoch': 0.68}
 69%|████████████████████████████▊             | 159/232 [5:02:54<2:07:48, 105.05s/it] 69%|████████████████████████████▉             | 160/232 [5:04:37<2:05:36, 104.67s/it]                                                                                      {'loss': 2.3449, 'grad_norm': 0.15494287014007568, 'learning_rate': 2.341932502481226e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 537.9, 'epoch': 0.69}
 69%|████████████████████████████▉             | 160/232 [5:04:37<2:05:36, 104.67s/it] 69%|█████████████████████████████▏            | 161/232 [5:06:19<2:02:57, 103.91s/it]                                                                                      {'loss': 2.3852, 'grad_norm': 0.34528088569641113, 'learning_rate': 2.283579050477042e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 608.59, 'epoch': 0.69}
 69%|█████████████████████████████▏            | 161/232 [5:06:19<2:02:57, 103.91s/it] 70%|█████████████████████████████▎            | 162/232 [5:08:04<2:01:32, 104.18s/it]                                                                                      {'loss': 2.41, 'grad_norm': 0.18780948221683502, 'learning_rate': 2.2257458797172093e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 470.71, 'epoch': 0.7}
 70%|█████████████████████████████▎            | 162/232 [5:08:04<2:01:32, 104.18s/it] 70%|█████████████████████████████▌            | 163/232 [5:09:49<1:59:52, 104.23s/it]                                                                                      {'loss': 2.449, 'grad_norm': 0.18119558691978455, 'learning_rate': 2.1684440670983568e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 598.12, 'epoch': 0.7}
 70%|█████████████████████████████▌            | 163/232 [5:09:49<1:59:52, 104.23s/it] 71%|█████████████████████████████▋            | 164/232 [5:11:34<1:58:23, 104.46s/it]                                                                                      {'loss': 2.3244, 'grad_norm': 0.1482125073671341, 'learning_rate': 2.111684587745081e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 533.73, 'epoch': 0.71}
 71%|█████████████████████████████▋            | 164/232 [5:11:34<1:58:23, 104.46s/it] 71%|█████████████████████████████▊            | 165/232 [5:13:17<1:56:27, 104.29s/it]                                                                                      {'loss': 2.4997, 'grad_norm': 0.17195050418376923, 'learning_rate': 2.0554783129078564e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 482.3, 'epoch': 0.71}
 71%|█████████████████████████████▊            | 165/232 [5:13:17<1:56:27, 104.29s/it] 72%|██████████████████████████████            | 166/232 [5:15:02<1:54:46, 104.35s/it]                                                                                      {'loss': 2.5106, 'grad_norm': 0.15829245746135712, 'learning_rate': 1.9998360078808547e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 544.61, 'epoch': 0.71}
 72%|██████████████████████████████            | 166/232 [5:15:02<1:54:46, 104.35s/it] 72%|██████████████████████████████▏           | 167/232 [5:16:47<1:53:19, 104.60s/it]                                                                                      {'loss': 2.6374, 'grad_norm': 0.29079005122184753, 'learning_rate': 1.944768329940045e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 611.46, 'epoch': 0.72}
 72%|██████████████████████████████▏           | 167/232 [5:16:47<1:53:19, 104.60s/it] 72%|██████████████████████████████▍           | 168/232 [5:18:33<1:52:00, 105.01s/it]                                                                                      {'loss': 2.3114, 'grad_norm': 0.14395098388195038, 'learning_rate': 1.8902858263019746e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 561.31, 'epoch': 0.72}
 72%|██████████████████████████████▍           | 168/232 [5:18:33<1:52:00, 105.01s/it][2025-10-07 17:09:51,859] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 17:09:54,453] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1820712089538574
[2025-10-07 17:09:55,648] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1941492557525635
[2025-10-07 17:09:56,840] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1920030117034912
[2025-10-07 17:09:58,008] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1677920818328857
[2025-10-07 17:09:58,009] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.08s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.78s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.68s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.25s/it][A
 26%|█████████████                                     | 6/23 [00:41<02:08,  7.57s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:04,  7.76s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.88s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.03s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.07s/it][A
 48%|███████████████████████▍                         | 11/23 [01:22<01:37,  8.10s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:30<01:29,  8.11s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.18s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.17s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.17s/it][A
 70%|██████████████████████████████████               | 16/23 [02:02<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.01s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:43<00:16,  8.18s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.20s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.4160118103027344, 'eval_runtime': 188.376, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.196, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.72}
 72%|██████████████████████████████▍           | 168/232 [5:21:48<1:52:00, 105.01s/it]
100%|█████████████████████████████████████████████████| 23/23 [03:00<00:00,  8.20s/it][A
                                                                                      [A[2025-10-07 17:13:06,394] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 17:13:16,663] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-168
 73%|██████████████████████████████▌           | 169/232 [5:23:58<2:59:27, 170.91s/it]                                                                                      {'loss': 2.4697, 'grad_norm': 0.1664450615644455, 'learning_rate': 1.836398932103658e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 541.01, 'epoch': 0.73}
 73%|██████████████████████████████▌           | 169/232 [5:23:58<2:59:27, 170.91s/it] 73%|██████████████████████████████▊           | 170/232 [5:25:44<2:36:41, 151.64s/it]                                                                                      {'loss': 2.4647, 'grad_norm': 0.16507600247859955, 'learning_rate': 1.7831179684039041e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 620.34, 'epoch': 0.73}
 73%|██████████████████████████████▊           | 170/232 [5:25:44<2:36:41, 151.64s/it] 74%|██████████████████████████████▉           | 171/232 [5:27:25<2:18:38, 136.38s/it]                                                                                      {'loss': 2.5393, 'grad_norm': 0.2299947887659073, 'learning_rate': 1.7304531402065033e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 554.48, 'epoch': 0.74}
 74%|██████████████████████████████▉           | 171/232 [5:27:25<2:18:38, 136.38s/it] 74%|███████████████████████████████▏          | 172/232 [5:29:10<2:06:59, 126.99s/it]                                                                                      {'loss': 2.3934, 'grad_norm': 0.26125118136405945, 'learning_rate': 1.6784145345056519e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 454.37, 'epoch': 0.74}
 74%|███████████████████████████████▏          | 172/232 [5:29:10<2:06:59, 126.99s/it] 75%|███████████████████████████████▎          | 173/232 [5:30:53<1:57:44, 119.74s/it]                                                                                      {'loss': 2.4524, 'grad_norm': 0.25597989559173584, 'learning_rate': 1.627012118353965e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 552.18, 'epoch': 0.74}
 75%|███████████████████████████████▎          | 173/232 [5:30:53<1:57:44, 119.74s/it] 75%|███████████████████████████████▌          | 174/232 [5:32:38<1:51:31, 115.37s/it]                                                                                      {'loss': 2.3758, 'grad_norm': 0.15529395639896393, 'learning_rate': 1.5762557369534709e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.66, 'epoch': 0.75}
 75%|███████████████████████████████▌          | 174/232 [5:32:38<1:51:31, 115.37s/it] 75%|███████████████████████████████▋          | 175/232 [5:34:24<1:46:52, 112.49s/it]                                                                                      {'loss': 2.5284, 'grad_norm': 0.19255177676677704, 'learning_rate': 1.5261551117699358e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.51, 'epoch': 0.75}
 75%|███████████████████████████████▋          | 175/232 [5:34:24<1:46:52, 112.49s/it] 76%|███████████████████████████████▊          | 176/232 [5:36:09<1:42:49, 110.16s/it]                                                                                      {'loss': 2.4243, 'grad_norm': 0.29165372252464294, 'learning_rate': 1.4767198386708998e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.81, 'epoch': 0.76}
 76%|███████████████████████████████▊          | 176/232 [5:36:09<1:42:49, 110.16s/it] 76%|████████████████████████████████          | 177/232 [5:37:54<1:39:43, 108.79s/it]                                                                                      {'loss': 2.652, 'grad_norm': 0.17469967901706696, 'learning_rate': 1.427959386087761e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 620.67, 'epoch': 0.76}
 76%|████████████████████████████████          | 177/232 [5:37:54<1:39:43, 108.79s/it] 77%|████████████████████████████████▏         | 178/232 [5:39:40<1:36:58, 107.75s/it]                                                                                      {'loss': 2.3821, 'grad_norm': 0.16631750762462616, 'learning_rate': 1.3798830932022616e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.11, 'epoch': 0.77}
 77%|████████████████████████████████▏         | 178/232 [5:39:40<1:36:58, 107.75s/it] 77%|████████████████████████████████▍         | 179/232 [5:41:24<1:34:20, 106.80s/it]                                                                                      {'loss': 2.4935, 'grad_norm': 0.1531817764043808, 'learning_rate': 1.3325001681577482e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 572.08, 'epoch': 0.77}
 77%|████████████████████████████████▍         | 179/232 [5:41:24<1:34:20, 106.80s/it] 78%|████████████████████████████████▌         | 180/232 [5:43:11<1:32:24, 106.63s/it]                                                                                      {'loss': 2.4493, 'grad_norm': 0.1834908127784729, 'learning_rate': 1.2858196862955108e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.93, 'epoch': 0.77}
 78%|████████████████████████████████▌         | 180/232 [5:43:11<1:32:24, 106.63s/it] 78%|████████████████████████████████▊         | 181/232 [5:44:55<1:30:09, 106.07s/it]                                                                                      {'loss': 2.3978, 'grad_norm': 0.1634717881679535, 'learning_rate': 1.2398505884165652e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.54, 'epoch': 0.78}
 78%|████████████████████████████████▊         | 181/232 [5:44:55<1:30:09, 106.07s/it] 78%|████████████████████████████████▉         | 182/232 [5:46:40<1:28:09, 105.80s/it]                                                                                      {'loss': 2.5803, 'grad_norm': 0.1714055985212326, 'learning_rate': 1.1946016790692094e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 599.66, 'epoch': 0.78}
 78%|████████████████████████████████▉         | 182/232 [5:46:40<1:28:09, 105.80s/it] 79%|█████████████████████████████████▏        | 183/232 [5:48:25<1:26:03, 105.38s/it]                                                                                      {'loss': 2.3313, 'grad_norm': 0.16218791902065277, 'learning_rate': 1.1500816248626711e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 493.83, 'epoch': 0.79}
 79%|█████████████████████████████████▏        | 183/232 [5:48:25<1:26:03, 105.38s/it] 79%|█████████████████████████████████▎        | 184/232 [5:50:10<1:24:16, 105.33s/it]                                                                                      {'loss': 2.3898, 'grad_norm': 0.1729874461889267, 'learning_rate': 1.1062989528071683e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.82, 'epoch': 0.79}
 79%|█████████████████████████████████▎        | 184/232 [5:50:10<1:24:16, 105.33s/it] 80%|█████████████████████████████████▍        | 185/232 [5:51:56<1:22:32, 105.37s/it]                                                                                      {'loss': 2.3343, 'grad_norm': 0.1679660528898239, 'learning_rate': 1.0632620486807244e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.54, 'epoch': 0.8}
 80%|█████████████████████████████████▍        | 185/232 [5:51:56<1:22:32, 105.37s/it] 80%|█████████████████████████████████▋        | 186/232 [5:53:40<1:20:35, 105.13s/it]                                                                                      {'loss': 2.309, 'grad_norm': 0.16071046888828278, 'learning_rate': 1.0209791554230209e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 498.13, 'epoch': 0.8}
 80%|█████████████████████████████████▋        | 186/232 [5:53:40<1:20:35, 105.13s/it] 81%|█████████████████████████████████▊        | 187/232 [5:55:25<1:18:52, 105.18s/it]                                                                                      {'loss': 2.4983, 'grad_norm': 0.16588424146175385, 'learning_rate': 9.79458371556607e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 581.82, 'epoch': 0.8}
 81%|█████████████████████████████████▊        | 187/232 [5:55:25<1:18:52, 105.18s/it] 81%|██████████████████████████████████        | 188/232 [5:57:11<1:17:10, 105.25s/it]                                                                                      {'loss': 2.3184, 'grad_norm': 0.23015336692333221, 'learning_rate': 9.387076496357805e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.74, 'epoch': 0.81}
 81%|██████████████████████████████████        | 188/232 [5:57:11<1:17:10, 105.25s/it] 81%|██████████████████████████████████▏       | 189/232 [5:58:55<1:15:16, 105.04s/it]                                                                                      {'loss': 2.379, 'grad_norm': 0.1979931890964508, 'learning_rate': 8.987347947234193e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.2, 'epoch': 0.81}
 81%|██████████████████████████████████▏       | 189/232 [5:58:55<1:15:16, 105.04s/it] 82%|██████████████████████████████████▍       | 190/232 [6:00:41<1:13:41, 105.28s/it]                                                                                      {'loss': 2.3405, 'grad_norm': 0.15547919273376465, 'learning_rate': 8.595474628960598e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 593.02, 'epoch': 0.82}
 82%|██████████████████████████████████▍       | 190/232 [6:00:41<1:13:41, 105.28s/it] 82%|██████████████████████████████████▌       | 191/232 [6:02:26<1:11:46, 105.04s/it]                                                                                      {'loss': 2.4418, 'grad_norm': 0.1689731627702713, 'learning_rate': 8.211531597775136e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.69, 'epoch': 0.82}
 82%|██████████████████████████████████▌       | 191/232 [6:02:26<1:11:46, 105.04s/it] 83%|██████████████████████████████████▊       | 192/232 [6:04:11<1:10:10, 105.27s/it]                                                                                      {'loss': 2.3292, 'grad_norm': 0.19969233870506287, 'learning_rate': 7.835592391013053e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 389.12, 'epoch': 0.83}
 83%|██████████████████████████████████▊       | 192/232 [6:04:11<1:10:10, 105.27s/it][2025-10-07 17:55:30,257] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 17:55:32,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0582835674285889
[2025-10-07 17:55:33,637] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0531456470489502
[2025-10-07 17:55:34,689] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0518488883972168
[2025-10-07 17:55:35,735] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0457401275634766
[2025-10-07 17:55:35,736] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.76s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.66s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.23s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.75s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.88s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.06s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:37,  8.08s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:30<01:29,  8.10s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.18s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.17s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.01s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.07s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:43<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.20s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.4152867794036865, 'eval_runtime': 188.2145, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.83}
 83%|██████████████████████████████████▊       | 192/232 [6:07:25<1:10:10, 105.27s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.20s/it][A
                                                                                      [A[2025-10-07 17:58:43,960] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 17:58:54,750] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-192
 83%|██████████████████████████████████▉       | 193/232 [6:09:32<1:50:27, 169.92s/it]                                                                                      {'loss': 2.3347, 'grad_norm': 0.16426128149032593, 'learning_rate': 7.467729013021979e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 554.67, 'epoch': 0.83}
 83%|██████████████████████████████████▉       | 193/232 [6:09:32<1:50:27, 169.92s/it] 84%|███████████████████████████████████       | 194/232 [6:11:15<1:34:52, 149.80s/it]                                                                                      {'loss': 2.3597, 'grad_norm': 0.16122335195541382, 'learning_rate': 7.108011921370728e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 606.77, 'epoch': 0.83}
 84%|███████████████████████████████████       | 194/232 [6:11:15<1:34:52, 149.80s/it] 84%|███████████████████████████████████▎      | 195/232 [6:13:00<1:24:07, 136.42s/it]                                                                                      {'loss': 2.3711, 'grad_norm': 0.1594630926847458, 'learning_rate': 6.756510013354512e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 544.71, 'epoch': 0.84}
 84%|███████████████████████████████████▎      | 195/232 [6:13:00<1:24:07, 136.42s/it] 84%|███████████████████████████████████▍      | 196/232 [6:14:45<1:16:08, 126.89s/it]                                                                                      {'loss': 2.4303, 'grad_norm': 0.17642569541931152, 'learning_rate': 6.413290612798883e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.32, 'epoch': 0.84}
 84%|███████████████████████████████████▍      | 196/232 [6:14:45<1:16:08, 126.89s/it] 85%|███████████████████████████████████▋      | 197/232 [6:16:30<1:10:11, 120.34s/it]                                                                                      {'loss': 2.2769, 'grad_norm': 0.14454026520252228, 'learning_rate': 6.078419457165036e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.95, 'epoch': 0.85}
 85%|███████████████████████████████████▋      | 197/232 [6:16:30<1:10:11, 120.34s/it] 85%|███████████████████████████████████▊      | 198/232 [6:18:15<1:05:37, 115.82s/it]                                                                                      {'loss': 2.3751, 'grad_norm': 0.23137526214122772, 'learning_rate': 5.751960684959046e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 556.73, 'epoch': 0.85}
 85%|███████████████████████████████████▊      | 198/232 [6:18:15<1:05:37, 115.82s/it] 86%|████████████████████████████████████      | 199/232 [6:20:00<1:01:52, 112.50s/it]                                                                                      {'loss': 2.5033, 'grad_norm': 0.1744750589132309, 'learning_rate': 5.433976823447262e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.73, 'epoch': 0.86}
 86%|████████████████████████████████████      | 199/232 [6:20:00<1:01:52, 112.50s/it] 86%|█████████████████████████████████████▉      | 200/232 [6:21:46<58:59, 110.62s/it]                                                                                      {'loss': 2.5263, 'grad_norm': 0.1891147941350937, 'learning_rate': 5.124528776680371e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.83, 'epoch': 0.86}
 86%|█████████████████████████████████████▉      | 200/232 [6:21:46<58:59, 110.62s/it] 87%|██████████████████████████████████████      | 201/232 [6:23:31<56:15, 108.89s/it]                                                                                      {'loss': 2.405, 'grad_norm': 0.17183446884155273, 'learning_rate': 4.823675813828271e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.1, 'epoch': 0.86}
 87%|██████████████████████████████████████      | 201/232 [6:23:31<56:15, 108.89s/it] 87%|██████████████████████████████████████▎     | 202/232 [6:25:16<53:52, 107.76s/it]                                                                                      {'loss': 2.3735, 'grad_norm': 0.15647412836551666, 'learning_rate': 4.531475557828202e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.38, 'epoch': 0.87}
 87%|██████████████████████████████████████▎     | 202/232 [6:25:16<53:52, 107.76s/it] 88%|██████████████████████████████████████▌     | 203/232 [6:27:01<51:38, 106.83s/it]                                                                                      {'loss': 2.371, 'grad_norm': 0.1775280237197876, 'learning_rate': 4.2479839743480965e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.13, 'epoch': 0.87}
 88%|██████████████████████████████████████▌     | 203/232 [6:27:01<51:38, 106.83s/it] 88%|██████████████████████████████████████▋     | 204/232 [6:28:46<49:36, 106.31s/it]                                                                                      {'loss': 2.4249, 'grad_norm': 0.1871228665113449, 'learning_rate': 3.9732553610673465e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.87, 'epoch': 0.88}
 88%|██████████████████████████████████████▋     | 204/232 [6:28:46<49:36, 106.31s/it] 88%|██████████████████████████████████████▉     | 205/232 [6:30:31<47:40, 105.93s/it]                                                                                      {'loss': 2.4219, 'grad_norm': 0.18398220837116241, 'learning_rate': 3.7073423372770754e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.12, 'epoch': 0.88}
 88%|██████████████████████████████████████▉     | 205/232 [6:30:31<47:40, 105.93s/it] 89%|███████████████████████████████████████     | 206/232 [6:32:16<45:44, 105.54s/it]                                                                                      {'loss': 2.3863, 'grad_norm': 0.14451023936271667, 'learning_rate': 3.4502958338018754e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.47, 'epoch': 0.89}
 89%|███████████████████████████████████████     | 206/232 [6:32:16<45:44, 105.54s/it] 89%|███████████████████████████████████████▎    | 207/232 [6:34:01<43:55, 105.42s/it]                                                                                      {'loss': 2.4147, 'grad_norm': 0.15192757546901703, 'learning_rate': 3.20216508324494e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.78, 'epoch': 0.89}
 89%|███████████████████████████████████████▎    | 207/232 [6:34:01<43:55, 105.42s/it] 90%|███████████████████████████████████████▍    | 208/232 [6:35:46<42:08, 105.34s/it]                                                                                      {'loss': 2.303, 'grad_norm': 0.1576743721961975, 'learning_rate': 2.9629976105584266e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.21, 'epoch': 0.89}
 90%|███████████████████████████████████████▍    | 208/232 [6:35:46<42:08, 105.34s/it] 90%|███████████████████████████████████████▋    | 209/232 [6:37:31<40:18, 105.14s/it]                                                                                      {'loss': 2.49, 'grad_norm': 0.16963180899620056, 'learning_rate': 2.732839223940914e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 594.11, 'epoch': 0.9}
 90%|███████████████████████████████████████▋    | 209/232 [6:37:31<40:18, 105.14s/it] 91%|███████████████████████████████████████▊    | 210/232 [6:39:17<38:38, 105.40s/it]                                                                                      {'loss': 2.44, 'grad_norm': 0.1585075557231903, 'learning_rate': 2.5117340060636817e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 591.26, 'epoch': 0.9}
 91%|███████████████████████████████████████▊    | 210/232 [6:39:17<38:38, 105.40s/it] 91%|████████████████████████████████████████    | 211/232 [6:41:01<36:49, 105.19s/it]                                                                                      {'loss': 2.4358, 'grad_norm': 0.16591255366802216, 'learning_rate': 2.2997243056274822e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.0, 'epoch': 0.91}
 91%|████████████████████████████████████████    | 211/232 [6:41:01<36:49, 105.19s/it] 91%|████████████████████████████████████████▏   | 212/232 [6:42:46<35:03, 105.16s/it]                                                                                      {'loss': 2.2981, 'grad_norm': 0.1462378203868866, 'learning_rate': 2.096850729251404e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.6, 'epoch': 0.91}
 91%|████████████████████████████████████████▏   | 212/232 [6:42:47<35:03, 105.16s/it] 92%|████████████████████████████████████████▍   | 213/232 [6:44:32<33:17, 105.15s/it]                                                                                      {'loss': 2.3811, 'grad_norm': 0.1623452752828598, 'learning_rate': 1.903152133695385e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 579.91, 'epoch': 0.92}
 92%|████████████████████████████████████████▍   | 213/232 [6:44:32<33:17, 105.15s/it] 92%|████████████████████████████████████████▌   | 214/232 [6:46:17<31:31, 105.09s/it]                                                                                      {'loss': 2.397, 'grad_norm': 0.15678995847702026, 'learning_rate': 1.7186656184179475e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 560.78, 'epoch': 0.92}
 92%|████████████████████████████████████████▌   | 214/232 [6:46:17<31:31, 105.09s/it] 93%|████████████████████████████████████████▊   | 215/232 [6:48:02<29:47, 105.15s/it]                                                                                      {'loss': 2.4259, 'grad_norm': 0.20976859331130981, 'learning_rate': 1.543426518470431e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.46, 'epoch': 0.92}
 93%|████████████████████████████████████████▊   | 215/232 [6:48:02<29:47, 105.15s/it] 93%|████████████████████████████████████████▉   | 216/232 [6:49:47<28:04, 105.28s/it]                                                                                      {'loss': 2.5872, 'grad_norm': 0.18247883021831512, 'learning_rate': 1.3774683977292426e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 620.68, 'epoch': 0.93}
 93%|████████████████████████████████████████▉   | 216/232 [6:49:47<28:04, 105.28s/it][2025-10-07 18:41:06,193] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 18:41:08,512] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.050823450088501
[2025-10-07 18:41:09,574] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.061586856842041
[2025-10-07 18:41:10,635] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.060603380203247
[2025-10-07 18:41:11,689] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0533573627471924
[2025-10-07 18:41:11,690] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]

  0%|                                                          | 0/23 [00:00<?, ?it/s][A
  9%|████▎                                             | 2/23 [00:08<01:25,  4.06s/it][A
 13%|██████▌                                           | 3/23 [00:16<01:55,  5.76s/it][A
 17%|████████▋                                         | 4/23 [00:24<02:06,  6.66s/it][A
 22%|██████████▊                                       | 5/23 [00:32<02:10,  7.23s/it][A
 26%|█████████████                                     | 6/23 [00:40<02:08,  7.56s/it][A
 30%|███████████████▏                                  | 7/23 [00:49<02:03,  7.74s/it][A
 35%|█████████████████▍                                | 8/23 [00:57<01:58,  7.87s/it][A
 39%|███████████████████▌                              | 9/23 [01:05<01:52,  8.02s/it][A
 43%|█████████████████████▎                           | 10/23 [01:13<01:44,  8.06s/it][A
 48%|███████████████████████▍                         | 11/23 [01:21<01:37,  8.09s/it][A
 52%|█████████████████████████▌                       | 12/23 [01:30<01:29,  8.11s/it][A
 57%|███████████████████████████▋                     | 13/23 [01:38<01:21,  8.18s/it][A
 61%|█████████████████████████████▊                   | 14/23 [01:46<01:13,  8.17s/it][A
 65%|███████████████████████████████▉                 | 15/23 [01:54<01:05,  8.16s/it][A
 70%|██████████████████████████████████               | 16/23 [02:01<00:55,  7.90s/it][A
 74%|████████████████████████████████████▏            | 17/23 [02:10<00:48,  8.01s/it][A
 78%|██████████████████████████████████████▎          | 18/23 [02:18<00:40,  8.06s/it][A
 83%|████████████████████████████████████████▍        | 19/23 [02:26<00:32,  8.09s/it][A
 87%|██████████████████████████████████████████▌      | 20/23 [02:34<00:24,  8.11s/it][A
 91%|████████████████████████████████████████████▋    | 21/23 [02:43<00:16,  8.17s/it][A
 96%|██████████████████████████████████████████████▊  | 22/23 [02:51<00:08,  8.18s/it][A
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.20s/it][A                                                                                      
                                                                                      [A{'eval_loss': 2.4150502681732178, 'eval_runtime': 188.221, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.93}
 93%|████████████████████████████████████████▉   | 216/232 [6:53:01<28:04, 105.28s/it]
100%|█████████████████████████████████████████████████| 23/23 [02:59<00:00,  8.20s/it][A
                                                                                      [A[2025-10-07 18:44:19,920] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 18:44:30,660] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-216
 94%|█████████████████████████████████████████▏  | 217/232 [6:55:11<42:40, 170.68s/it]                                                                                      {'loss': 2.4799, 'grad_norm': 0.2342216670513153, 'learning_rate': 1.2208230424672562e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 458.67, 'epoch': 0.93}
 94%|█████████████████████████████████████████▏  | 217/232 [6:55:11<42:40, 170.68s/it] 94%|█████████████████████████████████████████▎  | 218/232 [6:56:57<35:18, 151.30s/it]                                                                                      {'loss': 2.4609, 'grad_norm': 0.20292286574840546, 'learning_rate': 1.0735204552657641e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 623.66, 'epoch': 0.94}
 94%|█████████████████████████████████████████▎  | 218/232 [6:56:57<35:18, 151.30s/it] 94%|█████████████████████████████████████████▌  | 219/232 [6:58:42<29:47, 137.53s/it]                                                                                      {'loss': 2.3216, 'grad_norm': 0.18959718942642212, 'learning_rate': 9.355888492680155e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.9, 'epoch': 0.94}
 94%|█████████████████████████████████████████▌  | 219/232 [6:58:42<29:47, 137.53s/it] 95%|█████████████████████████████████████████▋  | 220/232 [7:00:28<25:35, 127.94s/it]                                                                                      {'loss': 2.3656, 'grad_norm': 0.15441013872623444, 'learning_rate': 8.070546427754899e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 614.72, 'epoch': 0.95}
 95%|█████████████████████████████████████████▋  | 220/232 [7:00:28<25:35, 127.94s/it] 95%|█████████████████████████████████████████▉  | 221/232 [7:02:12<22:10, 120.97s/it]                                                                                      {'loss': 2.415, 'grad_norm': 0.15820012986660004, 'learning_rate': 6.879424541879676e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 579.88, 'epoch': 0.95}
 95%|█████████████████████████████████████████▉  | 221/232 [7:02:12<22:10, 120.97s/it] 96%|██████████████████████████████████████████  | 222/232 [7:03:58<19:22, 116.23s/it]                                                                                      {'loss': 2.4204, 'grad_norm': 0.18182291090488434, 'learning_rate': 5.782750972883111e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 614.62, 'epoch': 0.95}
 96%|██████████████████████████████████████████  | 222/232 [7:03:58<19:22, 116.23s/it] 96%|██████████████████████████████████████████▎ | 223/232 [7:05:42<16:54, 112.77s/it]                                                                                      {'loss': 2.4713, 'grad_norm': 0.16219539940357208, 'learning_rate': 4.780735768728895e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.89, 'epoch': 0.96}
 96%|██████████████████████████████████████████▎ | 223/232 [7:05:42<16:54, 112.77s/it] 97%|██████████████████████████████████████████▍ | 224/232 [7:07:28<14:44, 110.54s/it]                                                                                      {'loss': 2.3601, 'grad_norm': 0.1545465886592865, 'learning_rate': 3.873570847285013e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.2, 'epoch': 0.96}
 97%|██████████████████████████████████████████▍ | 224/232 [7:07:28<14:44, 110.54s/it] 97%|██████████████████████████████████████████▋ | 225/232 [7:09:13<12:42, 108.97s/it]                                                                                      {'loss': 2.4025, 'grad_norm': 0.20205241441726685, 'learning_rate': 3.0614299595654875e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.33, 'epoch': 0.97}
 97%|██████████████████████████████████████████▋ | 225/232 [7:09:13<12:42, 108.97s/it] 97%|██████████████████████████████████████████▊ | 226/232 [7:10:58<10:45, 107.67s/it]                                                                                      {'loss': 2.5119, 'grad_norm': 0.17662324011325836, 'learning_rate': 2.3444686564511042e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 580.82, 'epoch': 0.97}
 97%|██████████████████████████████████████████▊ | 226/232 [7:10:58<10:45, 107.67s/it] 98%|███████████████████████████████████████████ | 227/232 [7:12:43<08:54, 106.96s/it]                                                                                      {'loss': 2.3202, 'grad_norm': 0.18644018471240997, 'learning_rate': 1.7228242588969714e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.32, 'epoch': 0.98}
 98%|███████████████████████████████████████████ | 227/232 [7:12:43<08:54, 106.96s/it] 98%|███████████████████████████████████████████▏| 228/232 [7:14:28<07:05, 106.44s/it]                                                                                      {'loss': 2.3215, 'grad_norm': 0.14924494922161102, 'learning_rate': 1.1966158316307208e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.23, 'epoch': 0.98}
 98%|███████████████████████████████████████████▏| 228/232 [7:14:28<07:05, 106.44s/it] 99%|███████████████████████████████████████████▍| 229/232 [7:16:13<05:17, 105.91s/it]                                                                                      {'loss': 2.554, 'grad_norm': 0.18786196410655975, 'learning_rate': 7.65944160348142e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 524.39, 'epoch': 0.98}
 99%|███████████████████████████████████████████▍| 229/232 [7:16:13<05:17, 105.91s/it] 99%|███████████████████████████████████████████▌| 230/232 [7:17:59<03:31, 105.97s/it]                                                                                      {'loss': 2.3815, 'grad_norm': 0.16499285399913788, 'learning_rate': 4.308917324092887e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 451.24, 'epoch': 0.99}
 99%|███████████████████████████████████████████▌| 230/232 [7:17:59<03:31, 105.97s/it]100%|███████████████████████████████████████████▊| 231/232 [7:19:44<01:45, 105.61s/it]                                                                                      {'loss': 2.3755, 'grad_norm': 0.1830398291349411, 'learning_rate': 1.9152272103972746e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 550.24, 'epoch': 0.99}
100%|███████████████████████████████████████████▊| 231/232 [7:19:44<01:45, 105.61s/it]100%|████████████████████████████████████████████| 232/232 [7:21:30<00:00, 105.76s/it]                                                                                      {'loss': 2.3878, 'grad_norm': 0.18990111351013184, 'learning_rate': 4.788297303903732e-10, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 535.43, 'epoch': 1.0}
100%|████████████████████████████████████████████| 232/232 [7:21:30<00:00, 105.76s/it][2025-10-07 19:12:48,537] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 19:12:57,756] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-232
                                                                                      {'train_runtime': 26517.4203, 'train_samples_per_second': 0.07, 'train_steps_per_second': 0.009, 'train_loss': 2.4443190539705344, 'memory/max_active (GiB)': 4.12, 'memory/max_allocated (GiB)': 4.12, 'memory/device_reserved (GiB)': 4.18, 'epoch': 1.0}
100%|████████████████████████████████████████████| 232/232 [7:21:54<00:00, 105.76s/it]100%|████████████████████████████████████████████| 232/232 [7:21:54<00:00, 114.29s/it]
[2025-10-07 19:16:04,232] [INFO] [axolotl.train.save_trained_model:225] [PID:8314] Training completed! Saving trained model to ckpts-mmarv.
[2025-10-07 19:16:04,250] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2025-10-07 19:16:13,614] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
[2025-10-07 19:16:25,556] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
Processing Files (0 / 0)                : |              |  0.00B /  0.00B            
New Data Upload                         : |              |  0.00B /  0.00B            [A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  29%|██▉       |  134MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  29%|██▉       |  134MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  32%|███▏      |  151MB /  473MB,   ???B/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  61%|██████    |  277MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  62%|██████▏   |  294MB /  473MB,  711MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  90%|█████████ |  411MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  90%|█████████ |  428MB /  473MB,  691MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (3 / 3)                : 100%|██████████|  473MB /  473MB,  537MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (3 / 3)                : 100%|██████████|  473MB /  473MB,  201MB/s  
New Data Upload                         : |              |  0.00B /  0.00B,  0.00B/s  
  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            
  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            
  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            
[2025-10-07 19:16:31,415] [INFO] [axolotl.train.save_trained_model:346] [PID:8314] Model successfully saved to ckpts-mmarv
[2025-10-07 19:16:41,179] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
Processing Files (0 / 0)                : |              |  0.00B /  0.00B            
New Data Upload                         : |              |  0.00B /  0.00B            [A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  31%|███       |  143MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  31%|███       |  143MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  34%|███▎      |  160MB /  473MB,   ???B/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  61%|██████    |  277MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  62%|██████▏   |  294MB /  473MB,  670MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors:  92%|█████████▏|  419MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (2 / 3)                :  92%|█████████▏|  437MB /  473MB,  692MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (3 / 3)                : 100%|██████████|  473MB /  473MB,  523MB/s  

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[A

  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            [A[A


  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            [A[A[A


  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB            [A[A[A[AProcessing Files (3 / 3)                : 100%|██████████|  473MB /  473MB,  224MB/s  
New Data Upload                         : |              |  0.00B /  0.00B,  0.00B/s  
  ...ining/ckpts-mmarv/training_args.bin: 100%|██████████| 7.95kB / 7.95kB            
  ...pts-mmarv/adapter_model.safetensors: 100%|██████████|  456MB /  456MB            
  ...training/ckpts-mmarv/tokenizer.json: 100%|██████████| 17.1MB / 17.1MB