File size: 28,085 Bytes

1f5ec7c

[2025-10-25 17:49:53,747] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:4001] bf16 support detected, enabling for this configuration.
[2025-10-25 17:49:53,988] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:4001] baseline 0.000GB ()
[2025-10-25 17:49:53,988] [INFO] [axolotl.cli.config.load_cfg:248] [PID:4001] config:
{
  "activation_offloading": true,
  "axolotl_config_path": "train.yml",
  "base_model": "Qwen/Qwen3-4B-Instruct-2507",
  "base_model_config": "Qwen/Qwen3-4B-Instruct-2507",
  "batch_size": 4,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "chat_template": "tokenizer_default",
  "context_parallel_size": 1,
  "cosine_min_lr_ratio": 0.1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "WokeAI/polititune-tankie-warmup",
      "split": "train",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "include_tkps": true,
  "learning_rate": 1e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "qwen3",
  "num_epochs": 2.0,
  "optimizer": "paged_ademamix_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./model-output",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": true,
  "save_safetensors": true,
  "save_steps": 0.25,
  "saves_per_epoch": 2,
  "sequence_len": 2048,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "eos_token": "<|im_end|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_project": "polititune-q34b-warmup",
  "warmup_ratio": 0.05,
  "weight_decay": 0.01,
  "world_size": 1
}
[2025-10-25 17:49:53,990] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:4001] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <|im_end|>
[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None
[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <|endoftext|>
[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None
[2025-10-25 17:49:54,858] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:4001] Loading prepared dataset from disk at last_run_prepared/a9098d9a4841d51fd558499bade3d148...
[2025-10-25 17:49:54,863] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:4001] total_num_tokens: 88_397
[2025-10-25 17:49:54,864] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:4001] `total_supervised_tokens: 81_792`
[2025-10-25 17:49:54,866] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
[2025-10-25 17:49:55,435] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
[2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.151627779006958
[2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
[2025-10-25 17:49:55,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.14948749542236328
[2025-10-25 17:49:55,737] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
[2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.15515494346618652
[2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
[2025-10-25 17:49:56,073] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.18137788772583008
[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46]
[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:4001] data_loader_len: 11
[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:4001] sample_packing_eff_est across ranks: [0.9383173403532609]
[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:4001] sample_packing_eff_est: 0.94
[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:4001] total_num_steps: 22
[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:4001] Maximum number of steps set at 22
[2025-10-25 17:49:56,115] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:4001] Loading tokenizer... Qwen/Qwen3-4B-Instruct-2507
[2025-10-25 17:49:56,797] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <|im_end|>
[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None
[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <|endoftext|>
[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None
[2025-10-25 17:49:56,798] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:4001] Loading model
[2025-10-25 17:49:57,139] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:4001] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-25 17:49:57,140] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:4001] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-10-25 17:49:57,140] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:4001] Applying multipack dataloader patch for sample packing...

Loading checkpoint shards:   0%|                                               | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|███████████████████████████████████████| 3/3 [00:00<00:00, 78.33it/s]
[2025-10-25 17:49:58,776] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:4001] Converting modules to torch.bfloat16
[2025-10-25 17:49:59,230] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:4001] Memory usage after model load 0.000GB ()
[2025-10-25 17:50:01,453] [INFO] [axolotl.train.save_initial_configs:402] [PID:4001] Pre-saving tokenizer to ./model-output...
[2025-10-25 17:50:01,532] [INFO] [axolotl.train.save_initial_configs:407] [PID:4001] Pre-saving model config to ./model-output...
[2025-10-25 17:50:01,534] [INFO] [axolotl.train.execute_training:196] [PID:4001] Starting trainer...
[2025-10-25 17:50:02,423] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32518529891967773
[2025-10-25 17:50:02,751] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3276631832122803
[2025-10-25 17:50:03,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3279576301574707
[2025-10-25 17:50:03,406] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32717275619506836
[2025-10-25 17:50:03,407] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46]
[34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...

[Am[2K
[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run f79oi2ub (0.1s)

[Am[2K
[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run f79oi2ub (0.1s)

[Am[2K
[34m[1mwandb[0m: Tracking run with wandb version 0.22.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20251025_175003-f79oi2ub[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mquiet-snowflake-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup/runs/f79oi2ub[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-10-25 17:50:05,900] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:4001] The Axolotl config has been saved to the WandB run under files.

  0%|                                                                         | 0/22 [00:00<?, ?it/s]
  5%|██▉                                                              | 1/22 [00:17<06:07, 17.49s/it]
                                                                                                     
{'loss': 3.3053, 'grad_norm': 24.125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.49, 'memory/max_allocated (GiB)': 18.49, 'memory/device_reserved (GiB)': 21.18, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.09}

  5%|██▉                                                              | 1/22 [00:17<06:07, 17.49s/it]
  9%|█████▉                                                           | 2/22 [00:27<04:24, 13.21s/it]
                                                                                                     
{'loss': 2.9641, 'grad_norm': 12.0, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 704.32, 'epoch': 0.17}

  9%|█████▉                                                           | 2/22 [00:27<04:24, 13.21s/it]
 14%|████████▊                                                        | 3/22 [00:37<03:43, 11.76s/it]
                                                                                                     
{'loss': 2.8185, 'grad_norm': 8.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 731.68, 'epoch': 0.26}

 14%|████████▊                                                        | 3/22 [00:37<03:43, 11.76s/it]
 18%|███████████▊                                                     | 4/22 [00:47<03:19, 11.08s/it]
                                                                                                     
{'loss': 2.8666, 'grad_norm': 6.46875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 728.47, 'epoch': 0.35}

 18%|███████████▊                                                     | 4/22 [00:47<03:19, 11.08s/it]
 23%|██████████████▊                                                  | 5/22 [00:57<03:02, 10.71s/it]
                                                                                                     
{'loss': 2.7515, 'grad_norm': 5.875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 716.16, 'epoch': 0.43}

 23%|██████████████▊                                                  | 5/22 [00:57<03:02, 10.71s/it]
 27%|█████████████████▋                                               | 6/22 [01:08<02:49, 10.62s/it]
                                                                                                     
{'loss': 2.7633, 'grad_norm': 5.53125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 681.72, 'epoch': 0.52}

 27%|█████████████████▋                                               | 6/22 [01:08<02:49, 10.62s/it][2025-10-25 17:51:14,180] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-6

 32%|████████████████████▋                                            | 7/22 [01:28<03:27, 13.81s/it]
                                                                                                     
{'loss': 2.6924, 'grad_norm': 5.59375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 699.03, 'epoch': 0.61}

 32%|████████████████████▋                                            | 7/22 [01:28<03:27, 13.81s/it]
 36%|███████████████████████▋                                         | 8/22 [01:38<02:56, 12.62s/it]
                                                                                                     
{'loss': 2.7207, 'grad_norm': 5.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 740.89, 'epoch': 0.7}

 36%|███████████████████████▋                                         | 8/22 [01:38<02:56, 12.62s/it]
 41%|██████████████████████████▌                                      | 9/22 [01:48<02:33, 11.82s/it]
                                                                                                     
{'loss': 2.6885, 'grad_norm': 4.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 718.9, 'epoch': 0.78}

 41%|██████████████████████████▌                                      | 9/22 [01:48<02:33, 11.82s/it]
 45%|█████████████████████████████                                   | 10/22 [01:58<02:15, 11.29s/it]
                                                                                                     
{'loss': 2.6238, 'grad_norm': 4.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 730.17, 'epoch': 0.87}

 45%|█████████████████████████████                                   | 10/22 [01:58<02:15, 11.29s/it]
 50%|████████████████████████████████                                | 11/22 [02:08<02:00, 10.92s/it]
                                                                                                     
{'loss': 2.6501, 'grad_norm': 4.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 702.7, 'epoch': 0.96}

 50%|████████████████████████████████                                | 11/22 [02:08<02:00, 10.92s/it]
 55%|██████████████████████████████████▉                             | 12/22 [02:16<01:37,  9.78s/it]
                                                                                                     
{'loss': 2.7149, 'grad_norm': 6.96875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 286.86, 'epoch': 1.0}

 55%|██████████████████████████████████▉                             | 12/22 [02:16<01:37,  9.78s/it][2025-10-25 17:52:22,046] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-12

 59%|█████████████████████████████████████▊                          | 13/22 [02:38<02:02, 13.56s/it]
                                                                                                     
{'loss': 2.5872, 'grad_norm': 4.375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 662.73, 'epoch': 1.09}

 59%|█████████████████████████████████████▊                          | 13/22 [02:38<02:02, 13.56s/it]
 64%|████████████████████████████████████████▋                       | 14/22 [02:48<01:40, 12.52s/it]
                                                                                                     
{'loss': 2.532, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 737.0, 'epoch': 1.17}

 64%|████████████████████████████████████████▋                       | 14/22 [02:48<01:40, 12.52s/it]
 68%|███████████████████████████████████████████▋                    | 15/22 [02:58<01:22, 11.78s/it]
                                                                                                     
{'loss': 2.4174, 'grad_norm': 3.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.14, 'epoch': 1.26}

 68%|███████████████████████████████████████████▋                    | 15/22 [02:58<01:22, 11.78s/it]
 73%|██████████████████████████████████████████████▌                 | 16/22 [03:08<01:07, 11.28s/it]
                                                                                                     
{'loss': 2.4644, 'grad_norm': 4.09375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.69, 'epoch': 1.35}

 73%|██████████████████████████████████████████████▌                 | 16/22 [03:08<01:07, 11.28s/it]
 77%|█████████████████████████████████████████████████▍              | 17/22 [03:18<00:54, 10.94s/it]
                                                                                                     
{'loss': 2.5299, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 713.56, 'epoch': 1.43}

 77%|█████████████████████████████████████████████████▍              | 17/22 [03:18<00:54, 10.94s/it]
 82%|████████████████████████████████████████████████████▎           | 18/22 [03:28<00:42, 10.70s/it]
                                                                                                     
{'loss': 2.4902, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 674.14, 'epoch': 1.52}

 82%|████████████████████████████████████████████████████▎           | 18/22 [03:28<00:42, 10.70s/it][2025-10-25 17:53:34,885] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-18

 86%|███████████████████████████████████████████████████████▎        | 19/22 [03:48<00:40, 13.41s/it]
                                                                                                     
{'loss': 2.4657, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 655.58, 'epoch': 1.61}

 86%|███████████████████████████████████████████████████████▎        | 19/22 [03:48<00:40, 13.41s/it]
 91%|██████████████████████████████████████████████████████████▏     | 20/22 [03:58<00:24, 12.42s/it]
                                                                                                     
{'loss': 2.4085, 'grad_norm': 4.0625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 709.39, 'epoch': 1.7}

 91%|██████████████████████████████████████████████████████████▏     | 20/22 [03:58<00:24, 12.42s/it]
 95%|█████████████████████████████████████████████████████████████   | 21/22 [04:08<00:11, 11.73s/it]
                                                                                                     
{'loss': 2.3577, 'grad_norm': 3.796875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 691.03, 'epoch': 1.78}

 95%|█████████████████████████████████████████████████████████████   | 21/22 [04:08<00:11, 11.73s/it]
100%|████████████████████████████████████████████████████████████████| 22/22 [04:19<00:00, 11.26s/it]
                                                                                                     
{'loss': 2.3456, 'grad_norm': 3.953125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 707.03, 'epoch': 1.87}

100%|████████████████████████████████████████████████████████████████| 22/22 [04:19<00:00, 11.26s/it][2025-10-25 17:54:25,016] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-22

                                                                                                     
{'train_runtime': 271.0135, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.081, 'train_loss': 2.643556302244013, 'memory/max_active (GiB)': 7.67, 'memory/max_allocated (GiB)': 7.67, 'memory/device_reserved (GiB)': 21.19, 'epoch': 1.87}

100%|████████████████████████████████████████████████████████████████| 22/22 [04:28<00:00, 11.26s/it]
100%|████████████████████████████████████████████████████████████████| 22/22 [04:28<00:00, 12.21s/it]
[2025-10-25 17:54:34,884] [INFO] [axolotl.train.save_trained_model:218] [PID:4001] Training completed! Saving trained model to ./model-output.
[2025-10-25 17:54:44,496] [INFO] [axolotl.train.save_trained_model:336] [PID:4001] Model successfully saved to ./model-output
[0m