| | [2026-02-21 20:37:21,574] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:5467] baseline 0.000GB (+0.000GB allocated) |
| | [2026-02-21 20:37:21,574] [INFO] [axolotl.cli.config.load_cfg:259] [PID:5467] config: |
| | { |
| | "activation_offloading": false, |
| | "adapter": "lora", |
| | "axolotl_config_path": "src/training/qwen3_axolotl_config.yml", |
| | "base_model": "Qwen/Qwen2-0.5B", |
| | "base_model_config": "Qwen/Qwen2-0.5B", |
| | "batch_size": 8, |
| | "bf16": false, |
| | "capabilities": { |
| | "bf16": true, |
| | "fp8": false, |
| | "n_gpu": 1, |
| | "n_node": 1 |
| | }, |
| | "context_parallel_size": 1, |
| | "dataloader_num_workers": 1, |
| | "dataloader_pin_memory": true, |
| | "dataloader_prefetch_factor": 256, |
| | "dataset_num_proc": 8, |
| | "dataset_prepared_path": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/last_run_prepared", |
| | "datasets": [ |
| | { |
| | "message_property_mappings": { |
| | "content": "content", |
| | "role": "role" |
| | }, |
| | "path": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/src/training/data_splits_axolotl/train_axolotl.jsonl", |
| | "trust_remote_code": false, |
| | "type": "alpaca" |
| | } |
| | ], |
| | "ddp": false, |
| | "device": "mps", |
| | "dion_rank_fraction": 1.0, |
| | "dion_rank_multiple_of": 1, |
| | "env_capabilities": { |
| | "torch_version": "2.10.0" |
| | }, |
| | "eval_batch_size": 1, |
| | "eval_causal_lm_metrics": [ |
| | "sacrebleu", |
| | "comet", |
| | "ter", |
| | "chrf" |
| | ], |
| | "eval_max_new_tokens": 128, |
| | "eval_table_size": 0, |
| | "experimental_skip_move_to_device": true, |
| | "fp16": false, |
| | "fp8": false, |
| | "gradient_accumulation_steps": 8, |
| | "gradient_checkpointing": true, |
| | "gradient_checkpointing_kwargs": { |
| | "use_reentrant": true |
| | }, |
| | "group_by_length": false, |
| | "include_tkps": true, |
| | "is_falcon_derived_model": false, |
| | "is_llama_derived_model": false, |
| | "is_mistral_derived_model": false, |
| | "learning_rate": 0.0002, |
| | "lisa_layers_attribute": "model.layers", |
| | "load_best_model_at_end": false, |
| | "load_in_4bit": true, |
| | "load_in_8bit": false, |
| | "local_rank": 0, |
| | "logging_steps": 10, |
| | "lora_alpha": 16, |
| | "lora_dropout": 0.05, |
| | "lora_modules_to_save": [ |
| | "embed_tokens", |
| | "lm_head" |
| | ], |
| | "lora_r": 8, |
| | "lora_target_modules": [ |
| | "q_proj", |
| | "k_proj", |
| | "v_proj", |
| | "o_proj", |
| | "gate_proj", |
| | "up_proj", |
| | "down_proj" |
| | ], |
| | "loraplus_lr_embedding": 1e-06, |
| | "lr_scheduler": "cosine", |
| | "mean_resizing_embeddings": false, |
| | "micro_batch_size": 1, |
| | "model_config_type": "qwen2", |
| | "num_epochs": 2.0, |
| | "optimizer": "adamw_torch", |
| | "otel_metrics_host": "localhost", |
| | "otel_metrics_port": 8000, |
| | "output_dir": "/Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output", |
| | "pad_to_sequence_len": true, |
| | "pretrain_multipack_attn": true, |
| | "profiler_steps_start": 0, |
| | "qlora_sharded_model_loading": false, |
| | "ray_num_workers": 1, |
| | "resources_per_worker": { |
| | "GPU": 1 |
| | }, |
| | "sample_packing": false, |
| | "sample_packing_bin_size": 200, |
| | "sample_packing_group_size": 100000, |
| | "save_only_model": false, |
| | "save_safetensors": true, |
| | "save_steps": 15, |
| | "save_total_limit": 2, |
| | "sequence_len": 512, |
| | "shuffle_before_merging_datasets": false, |
| | "shuffle_merged_datasets": true, |
| | "skip_prepare_dataset": false, |
| | "streaming_multipack_buffer_size": 10000, |
| | "strict": false, |
| | "tensor_parallel_size": 1, |
| | "tf32": false, |
| | "tiled_mlp_use_original_mlp": true, |
| | "tokenizer_config": "Qwen/Qwen2-0.5B", |
| | "tokenizer_save_jinja_files": true, |
| | "tokenizer_type": "AutoTokenizer", |
| | "torch_dtype": "torch.float32", |
| | "train_on_inputs": false, |
| | "trl": { |
| | "log_completions": false, |
| | "mask_truncated_completions": false, |
| | "ref_model_mixup_alpha": 0.9, |
| | "ref_model_sync_steps": 64, |
| | "scale_rewards": true, |
| | "sync_ref_model": false, |
| | "use_vllm": false, |
| | "vllm_server_host": "0.0.0.0", |
| | "vllm_server_port": 8000 |
| | }, |
| | "trust_remote_code": true, |
| | "type_of_model": "AutoModelForCausalLM", |
| | "use_otel_metrics": false, |
| | "use_ray": false, |
| | "val_set_size": 0.0, |
| | "vllm": { |
| | "device": "auto", |
| | "dtype": "auto", |
| | "gpu_memory_utilization": 0.9, |
| | "host": "0.0.0.0", |
| | "port": 8000 |
| | }, |
| | "wandb_mode": "disabled", |
| | "warmup_steps": 50, |
| | "weight_decay": 0.0, |
| | "world_size": 1 |
| | } |
| | [2026-02-21 20:37:22,676] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:5467] EOS: 151643 / <|endoftext|> |
| | [2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:5467] BOS: None / None |
| | [2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:5467] PAD: 151643 / <|endoftext|> |
| | [2026-02-21 20:37:22,677] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:5467] UNK: None / None |
| | [2026-02-21 20:37:22,679] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:5467] Loading prepared dataset from disk at /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/last_run_prepared/41f31c3c9bc9eb4eb6e943fbbbb74dda... |
| | [2026-02-21 20:37:22,701] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:5467] total_num_tokens: 12_122 |
| | [2026-02-21 20:37:22,703] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:5467] `total_supervised_tokens: 1_660` |
| | [2026-02-21 20:37:22,703] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:5467] total_num_steps: 49 |
| | [2026-02-21 20:37:22,703] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:5467] Maximum number of steps set at 49 |
| | [2026-02-21 20:37:22,735] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:5467] loading tokenizer... Qwen/Qwen2-0.5B |
| | [2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:5467] EOS: 151643 / <|endoftext|> |
| | [2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:5467] BOS: None / None |
| | [2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:5467] PAD: 151643 / <|endoftext|> |
| | [2026-02-21 20:37:23,594] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:5467] UNK: None / None |
| | [2026-02-21 20:37:23,594] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:5467] Loading model |
| | [2026-02-21 20:37:23,742] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:5467] Patched Trainer.evaluation_loop with nanmean loss calculation |
| | [2026-02-21 20:37:23,744] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:5467] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation |
| | [2026-02-21 20:37:23,752] [WARNING] [transformers.modeling_utils.warning_once:328] [PID:5467] `torch_dtype` is deprecated! Use `dtype` instead! |
| | [2026-02-21 20:37:27,045] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:853] [PID:5467] converting PEFT model w/ prepare_model_for_kbit_training |
| | [2026-02-21 20:37:27,047] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:5467] Converting modules to torch.float32 |
| | [2026-02-21 20:37:27,049] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5467] Memory usage after model load 0.000GB (+0.000GB allocated) |
| | [2026-02-21 20:37:27,053] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/peft/tuners/tuners_utils.py:1225: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777 |
| | warnings.warn(msg) |
| |
|
| | trainable params: 276,668,416 || all params: 770,701,184 || trainable%: 35.8983 |
| | [2026-02-21 20:37:27,656] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:5467] after adapters 0.000GB (+0.000GB allocated) |
| | [2026-02-21 20:37:33,551] [INFO] [axolotl.train.save_initial_configs:413] [PID:5467] Pre-saving adapter config to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output... |
| | [2026-02-21 20:37:33,553] [INFO] [axolotl.train.save_initial_configs:417] [PID:5467] Pre-saving tokenizer to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output... |
| | [2026-02-21 20:37:33,733] [INFO] [axolotl.train.save_initial_configs:422] [PID:5467] Pre-saving model config to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output... |
| | [2026-02-21 20:37:33,737] [INFO] [axolotl.train.execute_training:212] [PID:5467] Starting trainer... |
| | [2026-02-21 20:37:33,737] [WARNING] [transformers.trainer._align_special_tokens:982] [PID:5467] The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. |
| |
0%| | 0/49 [00:00<?, ?it/s][2026-02-21 20:37:34,099] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:1118: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, device pinned memory won't be used. |
| | super().__init__(loader) |
| |
|
| |
2%|β | 1/49 [00:49<39:37, 49.52s/it]
4%|β | 2/49 [01:38<38:45, 49.48s/it]
6%|β | 3/49 [02:39<41:49, 54.56s/it]
8%|β | 4/49 [03:19<36:40, 48.90s/it]
10%|β | 5/49 [04:06<35:15, 48.07s/it]
12%|ββ | 6/49 [04:46<32:31, 45.39s/it]
14%|ββ | 7/49 [05:33<32:04, 45.82s/it]
16%|ββ | 8/49 [06:19<31:27, 46.03s/it]
18%|ββ | 9/49 [06:59<29:27, 44.20s/it]
20%|ββ | 10/49 [07:39<27:45, 42.71s/it]
{'loss': 2.1003, 'grad_norm': 29.422929763793945, 'learning_rate': 3.6e-05, 'ppl': 8.16862, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.3365485370159149, 'tokens/total': 40960, 'tokens/trainable': 732, 'epoch': 0.41} |
| |
20%|ββ | 10/49 [07:43<27:45, 42.71s/it]
22%|βββ | 11/49 [08:20<26:48, 42.33s/it]
24%|βββ | 12/49 [09:11<27:37, 44.80s/it]
27%|βββ | 13/49 [09:56<27:02, 45.07s/it]
29%|βββ | 14/49 [10:45<26:55, 46.15s/it]
31%|βββ | 15/49 [11:29<25:49, 45.58s/it][2026-02-21 20:49:03,976] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-15 |
| |
33%|ββββ | 16/49 [12:34<28:15, 51.37s/it]
35%|ββββ | 17/49 [13:17<26:04, 48.90s/it]
37%|ββββ | 18/49 [14:01<24:26, 47.32s/it]
39%|ββββ | 19/49 [14:52<24:09, 48.31s/it]
41%|ββββ | 20/49 [15:42<23:36, 48.83s/it]
{'loss': 1.3313, 'grad_norm': 20.819232940673828, 'learning_rate': 7.6e-05, 'ppl': 3.78596, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.13703587651252747, 'tokens/total': 81920, 'tokens/trainable': 1375, 'epoch': 0.82} |
| |
41%|ββββ | 20/49 [15:47<23:36, 48.83s/it]
43%|βββββ | 21/49 [16:32<22:58, 49.25s/it]
45%|βββββ | 22/49 [17:12<20:53, 46.42s/it]
47%|βββββ | 23/49 [17:55<19:46, 45.63s/it]
49%|βββββ | 24/49 [18:36<18:24, 44.19s/it]
51%|βββββ | 25/49 [19:00<15:11, 37.96s/it][2026-02-21 20:56:34,355] [WARNING] [py.warnings._showwarnmsg:110] [PID:5467] /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:1118: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, device pinned memory won't be used. |
| | super().__init__(loader) |
| | |
| |
53%|ββββββ | 26/49 [19:50<16:01, 41.81s/it]
55%|ββββββ | 27/49 [20:32<15:20, 41.84s/it]
57%|ββββββ | 28/49 [21:19<15:08, 43.24s/it]
59%|ββββββ | 29/49 [22:10<15:13, 45.67s/it]
61%|ββββββ | 30/49 [23:16<16:22, 51.72s/it]
{'loss': 0.8107, 'grad_norm': 10.385747909545898, 'learning_rate': 0.000116, 'ppl': 2.24948, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.2631860375404358, 'tokens/total': 119808, 'tokens/trainable': 1990, 'epoch': 1.21} |
| |
61%|ββββββ | 30/49 [23:22<16:22, 51.72s/it][2026-02-21 21:00:56,360] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-30 |
| |
63%|βββββββ | 31/49 [24:24<16:58, 56.56s/it]
65%|βββββββ | 32/49 [25:20<16:00, 56.51s/it]
67%|βββββββ | 33/49 [26:27<15:51, 59.48s/it]
69%|βββββββ | 34/49 [27:21<14:26, 57.79s/it]
71%|ββββββββ | 35/49 [28:10<12:52, 55.20s/it]
73%|ββββββββ | 36/49 [28:50<11:00, 50.77s/it]
76%|ββββββββ | 37/49 [29:32<09:37, 48.10s/it]
78%|ββββββββ | 38/49 [30:19<08:46, 47.83s/it]
80%|ββββββββ | 39/49 [31:07<07:57, 47.79s/it]
82%|βββββββββ | 40/49 [31:54<07:07, 47.45s/it]
{'loss': 0.4243, 'grad_norm': 9.247703552246094, 'learning_rate': 0.00015600000000000002, 'ppl': 1.52852, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'tokens/train_per_sec_per_gpu': 0.12217021733522415, 'tokens/total': 160768, 'tokens/trainable': 2671, 'epoch': 1.62} |
| |
82%|βββββββββ | 40/49 [31:59<07:07, 47.45s/it]
84%|βββββββββ | 41/49 [33:01<07:06, 53.31s/it]
86%|βββββββββ | 42/49 [33:56<06:17, 53.91s/it]
88%|βββββββββ | 43/49 [34:57<05:35, 55.99s/it]
90%|βββββββββ | 44/49 [35:49<04:34, 54.85s/it]
92%|ββββββββββ| 45/49 [36:49<03:45, 56.49s/it][2026-02-21 21:14:23,862] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-45 |
| |
94%|ββββββββββ| 46/49 [38:04<03:05, 61.88s/it]
96%|ββββββββββ| 47/49 [38:53<01:56, 58.24s/it]
98%|ββββββββββ| 48/49 [39:49<00:57, 57.40s/it]
100%|ββββββββββ| 49/49 [40:29<00:00, 52.10s/it][2026-02-21 21:18:03,251] [INFO] [axolotl.core.trainers.base._save:721] [PID:5467] Saving model checkpoint to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output/checkpoint-49 |
| |
{'train_runtime': 2450.0653, 'train_samples_per_second': 0.16, 'train_steps_per_second': 0.02, 'train_loss': 1.0411046378466549, 'memory/max_active (GiB)': 4.94, 'memory/max_allocated (GiB)': 8.44, 'memory/device_reserved (GiB)': 0, 'epoch': 1.99, 'tokens/train_per_sec_per_gpu': 0.19685673713684082} |
| |
100%|ββββββββββ| 49/49 [40:50<00:00, 52.10s/it]
100%|ββββββββββ| 49/49 [40:50<00:00, 50.00s/it] |
| | [2026-02-21 21:18:29,199] [INFO] [axolotl.train.save_trained_model:233] [PID:5467] Training completed! Saving trained model to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output. |
| | [2026-02-21 21:18:31,934] [INFO] [axolotl.train.save_trained_model:351] [PID:5467] Model successfully saved to /Users/duoyun/Desktop/zsh-llm-cli-autocomplete-tool/zsh-lora-output |
| | |