File size: 34,952 Bytes
a5c27d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
[2025-10-22 16:34:24,607] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:2199] bf16 support detected, enabling for this configuration.
[2025-10-22 16:34:24,749] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:2199] baseline 0.000GB ()
[2025-10-22 16:34:24,749] [INFO] [axolotl.cli.config.load_cfg:248] [PID:2199] config:
{
"activation_offloading": false,
"adapter": "lora",
"axolotl_config_path": "config.yaml",
"base_model": "Qwen/Qwen2.5-7B-Instruct",
"base_model_config": "Qwen/Qwen2.5-7B-Instruct",
"batch_size": 16,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": false,
"n_gpu": 1,
"n_node": 1
},
"context_parallel_size": 1,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_processes": 32,
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "/workspace/fine-tuning/data/trump.json",
"trust_remote_code": false,
"type": "alpaca"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.7.1"
},
"eval_batch_size": 16,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": false,
"include_tkps": true,
"learning_rate": 0.0002,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": true,
"local_rank": 0,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_r": 8,
"lora_target_modules": [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"micro_batch_size": 16,
"model_config_type": "qwen2",
"num_epochs": 1.0,
"optimizer": "adamw_bnb_8bit",
"output_dir": "./outputs/thoth_text_v2",
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"sequence_len": 4096,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen2.5-7B-Instruct",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"weight_decay": 0.0,
"world_size": 1
}
[2025-10-22 16:34:25,444] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:2199] EOS: 151645 / <|im_end|>
[2025-10-22 16:34:25,444] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:2199] BOS: None / None
[2025-10-22 16:34:25,444] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:2199] PAD: 151643 / <|endoftext|>
[2025-10-22 16:34:25,444] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:2199] UNK: None / None
[2025-10-22 16:34:25,445] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:2199] Unable to find prepared dataset in last_run_prepared/8eaeba3b90268710df560cd4dfe04e2d
[2025-10-22 16:34:25,445] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:2199] Loading raw datasets...
[2025-10-22 16:34:25,445] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:2199] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2025-10-22 16:34:25,754] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:2199] Loading dataset: /workspace/fine-tuning/data/trump.json with base_type: alpaca and prompt_style: None
[2025-10-22 16:34:25,754] [WARNING] [datasets.arrow_dataset.map:3100] [PID:2199] num_proc must be <= 15. Reducing num_proc to 15 for dataset of size 15.
Tokenizing Prompts (num_proc=15): 0%| | 0/15 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=15): 7%|βββββ | 1/15 [00:00<00:08, 1.56 examples/s]
Tokenizing Prompts (num_proc=15): 13%|βββββββββββ | 2/15 [00:00<00:04, 3.01 examples/s]
Tokenizing Prompts (num_proc=15): 27%|βββββββββββββββββββββ | 4/15 [00:00<00:01, 5.98 examples/s]
Tokenizing Prompts (num_proc=15): 47%|ββββββββββββββββββββββββββββββββββββ | 7/15 [00:01<00:00, 9.94 examples/s]
Tokenizing Prompts (num_proc=15): 60%|ββββββββββββββββββββββββββββββββββββββββββββββ | 9/15 [00:01<00:00, 11.64 examples/s]
Tokenizing Prompts (num_proc=15): 73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 11/15 [00:01<00:00, 12.43 examples/s]
Tokenizing Prompts (num_proc=15): 87%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 13/15 [00:01<00:00, 12.76 examples/s]
Tokenizing Prompts (num_proc=15): 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:01<00:00, 14.05 examples/s]
Tokenizing Prompts (num_proc=15): 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:01<00:00, 8.67 examples/s]
[2025-10-22 16:34:27,600] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:2199] min_input_len: 46
[2025-10-22 16:34:27,600] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:2199] max_input_len: 87
[2025-10-22 16:34:27,601] [WARNING] [datasets.arrow_dataset.map:3100] [PID:2199] num_proc must be <= 15. Reducing num_proc to 15 for dataset of size 15.
Dropping Long Sequences (>4096) (num_proc=15): 0%| | 0/15 [00:00<?, ? examples/s]
Dropping Long Sequences (>4096) (num_proc=15): 7%|βββββ | 1/15 [00:00<00:03, 4.29 examples/s]
Dropping Long Sequences (>4096) (num_proc=15): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:00<00:00, 34.65 examples/s]
Saving the dataset (0/1 shards): 0%| | 0/15 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:00<00:00, 1508.60 examples/s]
Saving the dataset (1/1 shards): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:00<00:00, 1468.01 examples/s]
[2025-10-22 16:34:28,163] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:2199] total_num_tokens: 1_016
[2025-10-22 16:34:28,164] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:2199] `total_supervised_tokens: 386`
[2025-10-22 16:34:28,164] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:2199] total_num_steps: 1
[2025-10-22 16:34:28,164] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:2199] Maximum number of steps set at 1
[2025-10-22 16:34:28,208] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:2199] Loading tokenizer... Qwen/Qwen2.5-7B-Instruct
[2025-10-22 16:34:28,772] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:2199] EOS: 151645 / <|im_end|>
[2025-10-22 16:34:28,772] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:2199] BOS: None / None
[2025-10-22 16:34:28,772] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:2199] PAD: 151643 / <|endoftext|>
[2025-10-22 16:34:28,772] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:2199] UNK: None / None
[2025-10-22 16:34:28,772] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:2199] Loading model
[2025-10-22 16:34:28,893] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:2199] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-22 16:34:28,895] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:2199] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 27.8kB [00:00, 71.6MB/s]
model-00001-of-00004.safetensors: 0%| | 0.00/3.95G [00:00<?, ?B/s]
model-00001-of-00004.safetensors: 0%| | 52.0k/3.95G [00:01<35:38:11, 30.8kB/s]
model-00001-of-00004.safetensors: 1%|β | 52.6M/3.95G [00:01<01:34, 41.0MB/s]
model-00001-of-00004.safetensors: 2%|ββ | 83.8M/3.95G [00:02<01:22, 46.6MB/s]
model-00001-of-00004.safetensors: 6%|βββββ | 218M/3.95G [00:02<00:23, 157MB/s]
model-00001-of-00004.safetensors: 11%|βββββββββ | 446M/3.95G [00:02<00:09, 389MB/s]
model-00001-of-00004.safetensors: 14%|ββββββββββββ | 571M/3.95G [00:02<00:07, 458MB/s]
model-00001-of-00004.safetensors: 18%|βββββββββββββββ | 728M/3.95G [00:02<00:06, 533MB/s]
model-00001-of-00004.safetensors: 21%|βββββββββββββββββ | 835M/3.95G [00:03<00:06, 500MB/s]
model-00001-of-00004.safetensors: 23%|βββββββββββββββββββ | 917M/3.95G [00:03<00:07, 413MB/s]
model-00001-of-00004.safetensors: 25%|ββββββββββββββββββββ | 998M/3.95G [00:03<00:06, 445MB/s]
model-00001-of-00004.safetensors: 29%|βββββββββββββββββββββββ | 1.13G/3.95G [00:03<00:05, 496MB/s]
model-00001-of-00004.safetensors: 32%|ββββββββββββββββββββββββββ | 1.28G/3.95G [00:04<00:04, 647MB/s]
model-00001-of-00004.safetensors: 36%|ββββββββββββββββββββββββββββ | 1.41G/3.95G [00:04<00:03, 675MB/s]
model-00001-of-00004.safetensors: 38%|ββββββββββββββββββββββββββββββ | 1.51G/3.95G [00:04<00:03, 713MB/s]
model-00001-of-00004.safetensors: 42%|βββββββββββββββββββββββββββββββββ | 1.67G/3.95G [00:04<00:02, 892MB/s]
model-00001-of-00004.safetensors: 49%|ββββββββββββββββββββββββββββββββββββββ | 1.92G/3.95G [00:04<00:01, 1.19GB/s]
model-00001-of-00004.safetensors: 54%|ββββββββββββββββββββββββββββββββββββββββββ | 2.11G/3.95G [00:04<00:01, 1.07GB/s]
model-00001-of-00004.safetensors: 60%|ββββββββββββββββββββββββββββββββββββββββββββββ | 2.35G/3.95G [00:04<00:01, 1.21GB/s]
model-00001-of-00004.safetensors: 63%|βββββββββββββββββββββββββββββββββββββββββββββββββ | 2.48G/3.95G [00:05<00:01, 1.18GB/s]
model-00001-of-00004.safetensors: 67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.65G/3.95G [00:05<00:01, 1.14GB/s]
model-00001-of-00004.safetensors: 78%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.06G/3.95G [00:05<00:00, 1.75GB/s]
model-00001-of-00004.safetensors: 85%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.36G/3.95G [00:05<00:00, 1.88GB/s]
model-00001-of-00004.safetensors: 91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.61G/3.95G [00:05<00:00, 1.99GB/s]
model-00001-of-00004.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.95G/3.95G [00:05<00:00, 2.22GB/s]
model-00001-of-00004.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.95G/3.95G [00:05<00:00, 697MB/s]
model-00002-of-00004.safetensors: 0%| | 0.00/3.86G [00:00<?, ?B/s]
model-00002-of-00004.safetensors: 0%| | 3.45M/3.86G [00:01<32:49, 1.96MB/s]
model-00002-of-00004.safetensors: 3%|βββ | 108M/3.86G [00:02<01:04, 58.4MB/s]
model-00002-of-00004.safetensors: 9%|βββββββ | 338M/3.86G [00:02<00:18, 190MB/s]
model-00002-of-00004.safetensors: 10%|ββββββββ | 384M/3.86G [00:02<00:16, 206MB/s]
model-00002-of-00004.safetensors: 15%|ββββββββββββ | 576M/3.86G [00:02<00:08, 377MB/s]
model-00002-of-00004.safetensors: 18%|βββββββββββββββ | 709M/3.86G [00:03<00:06, 496MB/s]
model-00002-of-00004.safetensors: 28%|ββββββββββββββββββββββ | 1.08G/3.86G [00:03<00:02, 945MB/s]
model-00002-of-00004.safetensors: 33%|ββββββββββββββββββββββββββ | 1.26G/3.86G [00:03<00:02, 940MB/s]
model-00002-of-00004.safetensors: 37%|ββββββββββββββββββββββββββββββ | 1.45G/3.86G [00:03<00:02, 983MB/s]
model-00002-of-00004.safetensors: 42%|βββββββββββββββββββββββββββββββββ | 1.64G/3.86G [00:03<00:02, 1.01GB/s]
model-00002-of-00004.safetensors: 48%|βββββββββββββββββββββββββββββββββββββ | 1.84G/3.86G [00:03<00:01, 1.11GB/s]
model-00002-of-00004.safetensors: 54%|ββββββββββββββββββββββββββββββββββββββββββ | 2.07G/3.86G [00:03<00:01, 1.24GB/s]
model-00002-of-00004.safetensors: 60%|ββββββββββββββββββββββββββββββββββββββββββββββ | 2.30G/3.86G [00:04<00:01, 1.42GB/s]
model-00002-of-00004.safetensors: 73%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.81G/3.86G [00:04<00:00, 2.16GB/s]
model-00002-of-00004.safetensors: 80%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.07G/3.86G [00:04<00:00, 1.90GB/s]
model-00002-of-00004.safetensors: 86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.34G/3.86G [00:04<00:00, 1.68GB/s]
model-00002-of-00004.safetensors: 97%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.75G/3.86G [00:04<00:00, 2.08GB/s]
model-00002-of-00004.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.86G/3.86G [00:05<00:00, 703MB/s]
model-00003-of-00004.safetensors: 0%| | 0.00/3.86G [00:00<?, ?B/s]
model-00003-of-00004.safetensors: 0%| | 1.37M/3.86G [00:01<1:18:05, 824kB/s]
model-00003-of-00004.safetensors: 1%|β | 50.0M/3.86G [00:02<02:08, 29.6MB/s]
model-00003-of-00004.safetensors: 6%|βββββ | 243M/3.86G [00:02<00:23, 152MB/s]
model-00003-of-00004.safetensors: 10%|ββββββββ | 388M/3.86G [00:02<00:13, 254MB/s]
model-00003-of-00004.safetensors: 12%|ββββββββββ | 463M/3.86G [00:03<00:14, 234MB/s]
model-00003-of-00004.safetensors: 15%|ββββββββββββ | 564M/3.86G [00:03<00:11, 279MB/s]
model-00003-of-00004.safetensors: 21%|βββββββββββββββββ | 797M/3.86G [00:03<00:06, 511MB/s]
model-00003-of-00004.safetensors: 28%|βββββββββββββββββββββββ | 1.10G/3.86G [00:03<00:03, 806MB/s]
model-00003-of-00004.safetensors: 35%|ββββββββββββββββββββββββββββ | 1.37G/3.86G [00:03<00:02, 1.00GB/s]
model-00003-of-00004.safetensors: 42%|βββββββββββββββββββββββββββββββββ | 1.63G/3.86G [00:03<00:01, 1.27GB/s]
model-00003-of-00004.safetensors: 47%|βββββββββββββββββββββββββββββββββββββ | 1.83G/3.86G [00:04<00:01, 1.13GB/s]
model-00003-of-00004.safetensors: 68%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.63G/3.86G [00:04<00:00, 2.36GB/s]
model-00003-of-00004.safetensors: 77%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.97G/3.86G [00:04<00:00, 2.15GB/s]
model-00003-of-00004.safetensors: 85%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.29G/3.86G [00:04<00:00, 1.95GB/s]
model-00003-of-00004.safetensors: 95%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.68G/3.86G [00:04<00:00, 2.19GB/s]
model-00003-of-00004.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.86G/3.86G [00:05<00:00, 704MB/s]
model-00004-of-00004.safetensors: 0%| | 0.00/3.56G [00:00<?, ?B/s]
model-00004-of-00004.safetensors: 0%| | 11.9k/3.56G [00:01<134:42:23, 7.33kB/s]
model-00004-of-00004.safetensors: 0%|β | 12.1M/3.56G [00:02<07:39, 7.72MB/s]
model-00004-of-00004.safetensors: 3%|ββ | 95.0M/3.56G [00:02<00:45, 75.9MB/s]
model-00004-of-00004.safetensors: 5%|ββββ | 166M/3.56G [00:02<00:24, 139MB/s]
model-00004-of-00004.safetensors: 9%|ββββββββ | 323M/3.56G [00:02<00:10, 322MB/s]
model-00004-of-00004.safetensors: 12%|ββββββββββ | 419M/3.56G [00:02<00:09, 327MB/s]
model-00004-of-00004.safetensors: 17%|ββββββββββββββ | 617M/3.56G [00:02<00:05, 557MB/s]
model-00004-of-00004.safetensors: 21%|βββββββββββββββββ | 750M/3.56G [00:03<00:05, 524MB/s]
model-00004-of-00004.safetensors: 26%|βββββββββββββββββββββ | 917M/3.56G [00:03<00:04, 537MB/s]
model-00004-of-00004.safetensors: 30%|ββββββββββββββββββββββββ | 1.07G/3.56G [00:03<00:03, 649MB/s]
model-00004-of-00004.safetensors: 45%|βββββββββββββββββββββββββββββββββββ | 1.61G/3.56G [00:03<00:01, 1.43GB/s]
model-00004-of-00004.safetensors: 56%|ββββββββββββββββββββββββββββββββββββββββββββ | 1.99G/3.56G [00:03<00:00, 1.65GB/s]
model-00004-of-00004.safetensors: 65%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.32G/3.56G [00:03<00:00, 1.90GB/s]
model-00004-of-00004.safetensors: 73%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.58G/3.56G [00:04<00:00, 1.45GB/s]
model-00004-of-00004.safetensors: 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.89G/3.56G [00:04<00:00, 1.55GB/s]
model-00004-of-00004.safetensors: 87%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.09G/3.56G [00:04<00:00, 1.48GB/s]
model-00004-of-00004.safetensors: 93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.29G/3.56G [00:04<00:00, 1.34GB/s]
model-00004-of-00004.safetensors: 98%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.49G/3.56G [00:04<00:00, 1.35GB/s]
model-00004-of-00004.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.56G/3.56G [00:04<00:00, 727MB/s]
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards: 25%|βββββββββββββββββββββββ | 1/4 [00:04<00:12, 4.04s/it]
Loading checkpoint shards: 50%|ββββββββββββββββββββββββββββββββββββββββββββββ | 2/4 [00:09<00:09, 4.79s/it]
Loading checkpoint shards: 75%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3/4 [00:14<00:05, 5.01s/it]
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4/4 [00:18<00:00, 4.46s/it]
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4/4 [00:18<00:00, 4.56s/it]
generation_config.json: 0%| | 0.00/243 [00:00<?, ?B/s]
generation_config.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 243/243 [00:00<00:00, 2.84MB/s]
[2025-10-22 16:35:11,043] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:2199] converting PEFT model w/ prepare_model_for_kbit_training
[2025-10-22 16:35:11,045] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:2199] Converting modules to torch.bfloat16
[2025-10-22 16:35:11,047] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2199] Memory usage after model load 11.676GB (+11.676GB allocated, +13.172GB reserved)
trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643
[2025-10-22 16:35:11,254] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2199] after adapters 8.565GB (+8.565GB allocated, +13.248GB reserved)
[2025-10-22 16:35:16,678] [INFO] [axolotl.train.save_initial_configs:398] [PID:2199] Pre-saving adapter config to ./outputs/thoth_text_v2...
[2025-10-22 16:35:16,678] [INFO] [axolotl.train.save_initial_configs:402] [PID:2199] Pre-saving tokenizer to ./outputs/thoth_text_v2...
[2025-10-22 16:35:16,788] [INFO] [axolotl.train.save_initial_configs:407] [PID:2199] Pre-saving model config to ./outputs/thoth_text_v2...
[2025-10-22 16:35:16,790] [INFO] [axolotl.train.execute_training:196] [PID:2199] Starting trainer...
0%| | 0/1 [00:00<?, ?it/s][2025-10-22 16:35:17,741] [WARNING] [py.warnings._showwarnmsg:110] [PID:2199] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:01<00:00, 1.18s/it]
{'loss': 3.0765, 'grad_norm': 2.254563570022583, 'learning_rate': 0.0002, 'memory/max_active (GiB)': 30.16, 'memory/max_allocated (GiB)': 30.16, 'memory/device_reserved (GiB)': 30.95, 'tokens_per_second_per_gpu': 466.19, 'epoch': 1.0}
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:01<00:00, 1.18s/it][2025-10-22 16:35:18,399] [INFO] [axolotl.core.trainers.base._save:671] [PID:2199] Saving model checkpoint to ./outputs/thoth_text_v2/checkpoint-1
{'train_runtime': 1.6797, 'train_samples_per_second': 9.525, 'train_steps_per_second': 0.595, 'train_loss': 3.0764801502227783, 'memory/max_active (GiB)': 8.67, 'memory/max_allocated (GiB)': 8.67, 'memory/device_reserved (GiB)': 30.95, 'epoch': 1.0}
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:01<00:00, 1.18s/it]
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:01<00:00, 1.68s/it]
[2025-10-22 16:35:18,999] [INFO] [axolotl.train.save_trained_model:218] [PID:2199] Training completed! Saving trained model to ./outputs/thoth_text_v2.
[2025-10-22 16:35:19,335] [INFO] [axolotl.train.save_trained_model:336] [PID:2199] Model successfully saved to ./outputs/thoth_text_v2
|