File size: 15,768 Bytes

dd11053

[2026-02-21 20:37:21,574] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:5467] baseline 0.000GB (+0.000GB allocated)
[2026-02-21 20:37:21,574] [INFO] [axolotl.cli.config.load_cfg:259] [PID:5467] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "src/training/qwen3_axolotl_config.yml",
  "base_model": "Qwen/Qwen2-0.5B",
  "base_model_config": "Qwen/Qwen2-0.5B",
  "batch_size": 8,
  "bf16": false,
  "capabilities": {
    "bf16": true,
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 8,
  "dataset_prepared_path": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/last_run_prepared",
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/src/training/data_splits_axolotl/train_axolotl.jsonl",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": false,
  "device": "mps",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.10.0"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "fp8": false,
  "gradient_accumulation_steps": 8,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": false,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 10,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "lora_modules_to_save": [
    "embed_tokens",
    "lm_head"
  ],
  "lora_r": 8,
  "lora_target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "qwen2",
  "num_epochs": 2.0,
  "optimizer": "adamw_torch",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 15,
  "save_total_limit": 2,
  "sequence_len": 512,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen2-0.5B",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "torch_dtype": "torch.float32",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": true,
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_mode": "disabled",
  "warmup_steps": 50,
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-02-21 20:37:22,676] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:5467] EOS: 151643 / <|endoftext|>
[2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:5467] BOS: None / None
[2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:5467] PAD: 151643 / <|endoftext|>
[2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:5467] UNK: None / None
[2026-02-21 20:37:22,679] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:5467] Loading prepared dataset from disk at /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/last_run_prepared/41f31c3c9bc9eb4eb6e943fbbbb74dda...
[2026-02-21 20:37:22,701] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:5467] total_num_tokens: 12_122
[2026-02-21 20:37:22,703] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:5467] `total_supervised_tokens: 1_660`
[2026-02-21 20:37:22,703] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:5467] total_num_steps: 49
[2026-02-21 20:37:22,703] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:5467] Maximum number of steps set at 49
[2026-02-21 20:37:22,735] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:5467] loading tokenizer... Qwen/Qwen2-0.5B
[2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:5467] EOS: 151643 / <|endoftext|>
[2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:5467] BOS: None / None
[2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:5467] PAD: 151643 / <|endoftext|>
[2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:5467] UNK: None / None
[2026-02-21 20:37:23,594] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:5467] Loading model
[2026-02-21 20:37:23,742] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:5467] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-02-21 20:37:23,744] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:5467] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-02-21 20:37:23,752] [WARNING] [transformers.modeling_utils.warning_once:328] [PID:5467] `torch_dtype` is deprecated! Use `dtype` instead!
[2026-02-21 20:37:27,045] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:853] [PID:5467] converting PEFT model w/ prepare_model_for_kbit_training
[2026-02-21 20:37:27,047] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:5467] Converting modules to torch.float32
[2026-02-21 20:37:27,049] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5467] Memory usage after model load 0.000GB (+0.000GB allocated)
[2026-02-21 20:37:27,053] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/peft/tuners/tuners_utils.py:1225: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
  warnings.warn(msg)

trainable params: 276,668,416 || all params: 770,701,184 || trainable%: 35.8983
[2026-02-21 20:37:27,656] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5467] after adapters 0.000GB (+0.000GB allocated)
[2026-02-21 20:37:33,551] [INFO] [axolotl.train.save_initial_configs:413] [PID:5467] Pre-saving adapter config to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output...
[2026-02-21 20:37:33,553] [INFO] [axolotl.train.save_initial_configs:417] [PID:5467] Pre-saving tokenizer to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output...
[2026-02-21 20:37:33,733] [INFO] [axolotl.train.save_initial_configs:422] [PID:5467] Pre-saving model config to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output...
[2026-02-21 20:37:33,737] [INFO] [axolotl.train.execute_training:212] [PID:5467] Starting trainer...
[2026-02-21 20:37:33,737] [WARNING] [transformers.trainer._align_special_tokens:982] [PID:5467] The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.

  0%|          | 0/49 [00:00<?, ?it/s][2026-02-21 20:37:34,099] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:1118: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, device pinned memory won't be used.
  super().__init__(loader)


  2%|▏         | 1/49 [00:49<39:37, 49.52s/it]
  4%|▍         | 2/49 [01:38<38:45, 49.48s/it]
  6%|▌         | 3/49 [02:39<41:49, 54.56s/it]
  8%|▊         | 4/49 [03:19<36:40, 48.90s/it]
 10%|█         | 5/49 [04:06<35:15, 48.07s/it]
 12%|█▏        | 6/49 [04:46<32:31, 45.39s/it]
 14%|█▍        | 7/49 [05:33<32:04, 45.82s/it]
 16%|█▋        | 8/49 [06:19<31:27, 46.03s/it]
 18%|█▊        | 9/49 [06:59<29:27, 44.20s/it]
 20%|██        | 10/49 [07:39<27:45, 42.71s/it]
                                               
{'loss': 2.1003, 'grad_norm': 29.422929763793945, 'learning_rate': 3.6e-05, 'ppl': 8.16862, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.3365485370159149, 'tokens/total': 40960, 'tokens/trainable': 732, 'epoch': 0.41}

 20%|██        | 10/49 [07:43<27:45, 42.71s/it]
 22%|██▏       | 11/49 [08:20<26:48, 42.33s/it]
 24%|██▍       | 12/49 [09:11<27:37, 44.80s/it]
 27%|██▋       | 13/49 [09:56<27:02, 45.07s/it]
 29%|██▊       | 14/49 [10:45<26:55, 46.15s/it]
 31%|███       | 15/49 [11:29<25:49, 45.58s/it][2026-02-21 20:49:03,976] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-15

 33%|███▎      | 16/49 [12:34<28:15, 51.37s/it]
 35%|███▍      | 17/49 [13:17<26:04, 48.90s/it]
 37%|███▋      | 18/49 [14:01<24:26, 47.32s/it]
 39%|███▉      | 19/49 [14:52<24:09, 48.31s/it]
 41%|████      | 20/49 [15:42<23:36, 48.83s/it]
                                               
{'loss': 1.3313, 'grad_norm': 20.819232940673828, 'learning_rate': 7.6e-05, 'ppl': 3.78596, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.13703587651252747, 'tokens/total': 81920, 'tokens/trainable': 1375, 'epoch': 0.82}

 41%|████      | 20/49 [15:47<23:36, 48.83s/it]
 43%|████▎     | 21/49 [16:32<22:58, 49.25s/it]
 45%|████▍     | 22/49 [17:12<20:53, 46.42s/it]
 47%|████▋     | 23/49 [17:55<19:46, 45.63s/it]
 49%|████▉     | 24/49 [18:36<18:24, 44.19s/it]
 51%|█████     | 25/49 [19:00<15:11, 37.96s/it][2026-02-21 20:56:34,355] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:1118: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, device pinned memory won't be used.
  super().__init__(loader)


 53%|█████▎    | 26/49 [19:50<16:01, 41.81s/it]
 55%|█████▌    | 27/49 [20:32<15:20, 41.84s/it]
 57%|█████▋    | 28/49 [21:19<15:08, 43.24s/it]
 59%|█████▉    | 29/49 [22:10<15:13, 45.67s/it]
 61%|██████    | 30/49 [23:16<16:22, 51.72s/it]
                                               
{'loss': 0.8107, 'grad_norm': 10.385747909545898, 'learning_rate': 0.000116, 'ppl': 2.24948, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.2631860375404358, 'tokens/total': 119808, 'tokens/trainable': 1990, 'epoch': 1.21}

 61%|██████    | 30/49 [23:22<16:22, 51.72s/it][2026-02-21 21:00:56,360] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-30

 63%|██████▎   | 31/49 [24:24<16:58, 56.56s/it]
 65%|██████▌   | 32/49 [25:20<16:00, 56.51s/it]
 67%|██████▋   | 33/49 [26:27<15:51, 59.48s/it]
 69%|██████▉   | 34/49 [27:21<14:26, 57.79s/it]
 71%|███████▏  | 35/49 [28:10<12:52, 55.20s/it]
 73%|███████▎  | 36/49 [28:50<11:00, 50.77s/it]
 76%|███████▌  | 37/49 [29:32<09:37, 48.10s/it]
 78%|███████▊  | 38/49 [30:19<08:46, 47.83s/it]
 80%|███████▉  | 39/49 [31:07<07:57, 47.79s/it]
 82%|████████▏ | 40/49 [31:54<07:07, 47.45s/it]
                                               
{'loss': 0.4243, 'grad_norm': 9.247703552246094, 'learning_rate': 0.00015600000000000002, 'ppl': 1.52852, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.12217021733522415, 'tokens/total': 160768, 'tokens/trainable': 2671, 'epoch': 1.62}

 82%|████████▏ | 40/49 [31:59<07:07, 47.45s/it]
 84%|████████▎ | 41/49 [33:01<07:06, 53.31s/it]
 86%|████████▌ | 42/49 [33:56<06:17, 53.91s/it]
 88%|████████▊ | 43/49 [34:57<05:35, 55.99s/it]
 90%|████████▉ | 44/49 [35:49<04:34, 54.85s/it]
 92%|█████████▏| 45/49 [36:49<03:45, 56.49s/it][2026-02-21 21:14:23,862] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-45

 94%|█████████▍| 46/49 [38:04<03:05, 61.88s/it]
 96%|█████████▌| 47/49 [38:53<01:56, 58.24s/it]
 98%|█████████▊| 48/49 [39:49<00:57, 57.40s/it]
100%|██████████| 49/49 [40:29<00:00, 52.10s/it][2026-02-21 21:18:03,251] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-49

                                               
{'train_runtime': 2450.0653, 'train_samples_per_second': 0.16, 'train_steps_per_second': 0.02, 'train_loss': 1.0411046378466549, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'epoch': 1.99, 'tokens/train_per_sec_per_gpu': 0.19685673713684082}

100%|██████████| 49/49 [40:50<00:00, 52.10s/it]
100%|██████████| 49/49 [40:50<00:00, 50.00s/it]
[2026-02-21 21:18:29,199] [INFO] [axolotl.train.save_trained_model:233] [PID:5467] Training completed! Saving trained model to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output.
[2026-02-21 21:18:31,934] [INFO] [axolotl.train.save_trained_model:351] [PID:5467] Model successfully saved to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output