|
|
[2026-01-03 15:17:19,855] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:284] bf16 support detected, enabling for this configuration. |
|
|
config.json: 0
config.json: 100 |
|
|
[2026-01-03 15:17:20,070] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:284] baseline 0.000GB () |
|
|
[2026-01-03 15:17:20,071] [INFO] [axolotl.cli.config.load_cfg:256] [PID:284] config: |
|
|
{ |
|
|
"activation_offloading": false, |
|
|
"axolotl_config_path": "config.yaml", |
|
|
"base_model": "Qwen/Qwen3-4B-Instruct-2507", |
|
|
"base_model_config": "Qwen/Qwen3-4B-Instruct-2507", |
|
|
"batch_size": 8, |
|
|
"bf16": true, |
|
|
"capabilities": { |
|
|
"bf16": true, |
|
|
"compute_capability": "sm_90", |
|
|
"fp8": false, |
|
|
"n_gpu": 1, |
|
|
"n_node": 1 |
|
|
}, |
|
|
"context_parallel_size": 1, |
|
|
"dataloader_num_workers": 1, |
|
|
"dataloader_pin_memory": true, |
|
|
"dataloader_prefetch_factor": 256, |
|
|
"dataset_num_proc": 24, |
|
|
"dataset_prepared_path": "last_run_prepared", |
|
|
"datasets": [ |
|
|
{ |
|
|
"chat_template": "chatml", |
|
|
"field_messages": "messages", |
|
|
"message_property_mappings": { |
|
|
"content": "content", |
|
|
"role": "role" |
|
|
}, |
|
|
"path": "data.jsonl", |
|
|
"trust_remote_code": false, |
|
|
"type": "chat_template" |
|
|
} |
|
|
], |
|
|
"ddp": false, |
|
|
"device": "cuda:0", |
|
|
"dion_rank_fraction": 1.0, |
|
|
"dion_rank_multiple_of": 1, |
|
|
"env_capabilities": { |
|
|
"torch_version": "2.8.0" |
|
|
}, |
|
|
"eval_batch_size": 2, |
|
|
"eval_causal_lm_metrics": [ |
|
|
"sacrebleu", |
|
|
"comet", |
|
|
"ter", |
|
|
"chrf" |
|
|
], |
|
|
"eval_max_new_tokens": 128, |
|
|
"eval_sample_packing": true, |
|
|
"eval_steps": 0.08333333333333333, |
|
|
"eval_table_size": 0, |
|
|
"evals_per_epoch": 4, |
|
|
"experimental_skip_move_to_device": true, |
|
|
"flash_attention": true, |
|
|
"fp16": false, |
|
|
"gradient_accumulation_steps": 4, |
|
|
"gradient_checkpointing": true, |
|
|
"gradient_checkpointing_kwargs": { |
|
|
"use_reentrant": true |
|
|
}, |
|
|
"group_by_length": false, |
|
|
"include_tkps": true, |
|
|
"is_falcon_derived_model": false, |
|
|
"is_llama_derived_model": false, |
|
|
"is_mistral_derived_model": false, |
|
|
"learning_rate": 2e-05, |
|
|
"lisa_layers_attribute": "model.layers", |
|
|
"load_best_model_at_end": false, |
|
|
"load_in_4bit": false, |
|
|
"load_in_8bit": false, |
|
|
"local_rank": 0, |
|
|
"logging_steps": 1, |
|
|
"lora_dropout": 0.0, |
|
|
"loraplus_lr_embedding": 1e-06, |
|
|
"lr_scheduler": "cosine", |
|
|
"mean_resizing_embeddings": false, |
|
|
"micro_batch_size": 2, |
|
|
"model_config_type": "qwen3", |
|
|
"num_epochs": 3.0, |
|
|
"optimizer": "adamw_bnb_8bit", |
|
|
"otel_metrics_host": "localhost", |
|
|
"otel_metrics_port": 8000, |
|
|
"output_dir": "./tieto-code-mini-4b-instruct", |
|
|
"pad_to_sequence_len": true, |
|
|
"pretrain_multipack_attn": true, |
|
|
"profiler_steps_start": 0, |
|
|
"qlora_sharded_model_loading": false, |
|
|
"ray_num_workers": 1, |
|
|
"resources_per_worker": { |
|
|
"GPU": 1 |
|
|
}, |
|
|
"sample_packing": true, |
|
|
"sample_packing_bin_size": 200, |
|
|
"sample_packing_group_size": 100000, |
|
|
"save_only_model": false, |
|
|
"save_safetensors": true, |
|
|
"save_steps": 0.3333333333333333, |
|
|
"saves_per_epoch": 1, |
|
|
"sequence_len": 8192, |
|
|
"shuffle_before_merging_datasets": false, |
|
|
"shuffle_merged_datasets": true, |
|
|
"skip_prepare_dataset": false, |
|
|
"streaming_multipack_buffer_size": 10000, |
|
|
"strict": false, |
|
|
"tensor_parallel_size": 1, |
|
|
"tf32": false, |
|
|
"tiled_mlp_use_original_mlp": true, |
|
|
"tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507", |
|
|
"tokenizer_save_jinja_files": true, |
|
|
"tokenizer_type": "AutoTokenizer", |
|
|
"torch_dtype": "torch.bfloat16", |
|
|
"train_on_inputs": false, |
|
|
"trl": { |
|
|
"log_completions": false, |
|
|
"mask_truncated_completions": false, |
|
|
"ref_model_mixup_alpha": 0.9, |
|
|
"ref_model_sync_steps": 64, |
|
|
"scale_rewards": true, |
|
|
"sync_ref_model": false, |
|
|
"use_vllm": false, |
|
|
"vllm_server_host": "0.0.0.0", |
|
|
"vllm_server_port": 8000 |
|
|
}, |
|
|
"trust_remote_code": true, |
|
|
"type_of_model": "AutoModelForCausalLM", |
|
|
"use_otel_metrics": false, |
|
|
"use_ray": false, |
|
|
"val_set_size": 0.05, |
|
|
"vllm": { |
|
|
"device": "auto", |
|
|
"dtype": "auto", |
|
|
"gpu_memory_utilization": 0.9, |
|
|
"host": "0.0.0.0", |
|
|
"port": 8000 |
|
|
}, |
|
|
"warmup_steps": 10, |
|
|
"weight_decay": 0.0, |
|
|
"world_size": 1 |
|
|
} |
|
|
[2026-01-03 15:17:20,074] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:284] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets. |
|
|
tokenizer_config.json: 0.00B [00:00, ?B/s]
tokenizer_config.json: 9.38kB [00:00, 14.1MB/s] |
|
|
vocab.json: 0.00B [00:00, ?B/s]
vocab.json: 2.78MB [00:00, 46.0MB/s] |
|
|
merges.txt: 0.00B [00:00, ?B/s]
merges.txt: 1.67MB [00:00, 42.9MB/s] |
|
|
tokenizer.json: 0
tokenizer.json: 100
tokenizer.json: 100 |
|
|
[2026-01-03 15:17:22,709] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:284] EOS: 151645 / <|im_end|> |
|
|
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None |
|
|
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <|endoftext|> |
|
|
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None |
|
|
[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:284] Unable to find prepared dataset in last_run_prepared/90a4bd078072b9d1de83a8db5d6b8671 |
|
|
[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:284] Loading raw datasets... |
|
|
[2026-01-03 15:17:22,714] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:284] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. |
|
|
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 503 examples [00:00, 22482.50 examples/s] |
|
|
[2026-01-03 15:17:23,108] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:284] Loading dataset: data.jsonl with base_type: chat_template and prompt_style: None |
|
|
[2026-01-03 15:17:23,136] [INFO] [axolotl.prompt_strategies.chat_template.__call__:996] [PID:284] Using chat template: |
|
|
--- |
|
|
{ |
|
|
' + message['content'] + '<|im_end|>' + ' |
|
|
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant |
|
|
' }}{% endif %} |
|
|
|
|
|
--- |
|
|
Tokenizing Prompts (num_proc=24): 0%| | 0/503 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=24): 4%|ββββββ | 21/503 [00:02<00:57, 8.45 examples/s]
Tokenizing Prompts (num_proc=24): 8%|ββββββββββββ | 42/503 [00:02<00:25, 17.93 examples/s]
Tokenizing Prompts (num_proc=24): 13%|ββββββββββββββββββ | 63/503 [00:03<00:18, 24.44 examples/s]
Tokenizing Prompts (num_proc=24): 21%|ββββββββββββββββββββββββββββββ | 105/503 [00:03<00:08, 47.25 examples/s]
Tokenizing Prompts (num_proc=24): 25%|ββββββββββββββββββββββββββββββββββββ | 126/503 [00:03<00:06, 54.13 examples/s]
Tokenizing Prompts (num_proc=24): 33%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 168/503 [00:04<00:05, 65.11 examples/s]
Tokenizing Prompts (num_proc=24): 38%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 189/503 [00:04<00:04, 69.18 examples/s]
Tokenizing Prompts (num_proc=24): 42%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 210/503 [00:04<00:04, 72.90 examples/s]
Tokenizing Prompts (num_proc=24): 46%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 231/503 [00:04<00:03, 76.41 examples/s]
Tokenizing Prompts (num_proc=24): 50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 252/503 [00:05<00:03, 78.36 examples/s]
Tokenizing Prompts (num_proc=24): 54%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 273/503 [00:05<00:02, 80.45 examples/s]
Tokenizing Prompts (num_proc=24): 58%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 294/503 [00:05<00:02, 81.21 examples/s]
Tokenizing Prompts (num_proc=24): 63%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 315/503 [00:05<00:02, 82.67 examples/s]
Tokenizing Prompts (num_proc=24): 67%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 336/503 [00:06<00:01, 84.26 examples/s]
Tokenizing Prompts (num_proc=24): 75%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 378/503 [00:06<00:01, 84.70 examples/s]
Tokenizing Prompts (num_proc=24): 83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 420/503 [00:07<00:00, 96.25 examples/s]
Tokenizing Prompts (num_proc=24): 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 441/503 [00:07<00:00, 96.00 examples/s]
Tokenizing Prompts (num_proc=24): 92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 462/503 [00:07<00:00, 92.73 examples/s]
Tokenizing Prompts (num_proc=24): 96%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 483/503 [00:07<00:00, 98.27 examples/s]
Tokenizing Prompts (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:07<00:00, 96.06 examples/s]
Tokenizing Prompts (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:08<00:00, 61.38 examples/s] |
|
|
[2026-01-03 15:17:31,724] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:284] min_input_len: 141 |
|
|
[2026-01-03 15:17:31,725] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:284] max_input_len: 627 |
|
|
Dropping Long Sequences (>8192) (num_proc=24): 0%| | 0/503 [00:00<?, ? examples/s]
Dropping Long Sequences (>8192) (num_proc=24): 4%|ββββββ | 21/503 [00:00<00:18, 26.29 examples/s]
Dropping Long Sequences (>8192) (num_proc=24): 25%|ββββββββββββββββββββββββββββββββ | 126/503 [00:00<00:02, 180.11 examples/s]
Dropping Long Sequences (>8192) (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:01<00:00, 810.59 examples/s]
Dropping Long Sequences (>8192) (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:01<00:00, 414.08 examples/s] |
|
|
Drop Samples with Zero Trainable Tokens (num_proc=24): 0%| | 0/503 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=24): 4%|βββββ | 21/503 [00:00<00:17, 26.81 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=24): 8%|βββββββββββ | 42/503 [00:00<00:09, 50.73 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:01<00:00, 403.34 examples/s] |
|
|
Add position_id column (Sample Packing) (num_proc=24): 0%| | 0/503 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=24): 4%|βββββ | 21/503 [00:00<00:18, 26.21 examples/s]
Add position_id column (Sample Packing) (num_proc=24): 33%|ββββββββββββββββββββββββββββββββββββββββ | 168/503 [00:00<00:01, 243.49 examples/s]
Add position_id column (Sample Packing) (num_proc=24): 96%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 483/503 [00:01<00:00, 757.40 examples/s]
Add position_id column (Sample Packing) (num_proc=24): 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:01<00:00, 408.06 examples/s] |
|
|
Saving the dataset (0/1 shards): 0%| | 0/503 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:00<00:00, 2868.10 examples/s]
Saving the dataset (1/1 shards): 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:00<00:00, 2868.10 examples/s]
Saving the dataset (1/1 shards): 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 503/503 [00:00<00:00, 1856.18 examples/s] |
|
|
[2026-01-03 15:17:36,239] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:284] total_num_tokens: 8_887 |
|
|
[2026-01-03 15:17:36,241] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:284] `total_supervised_tokens: 6_724` |
|
|
[2026-01-03 15:17:36,251] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:38,093] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:38,424] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.33158135414123535 |
|
|
[2026-01-03 15:17:38,425] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:38,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.377777099609375 |
|
|
[2026-01-03 15:17:38,803] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:39,183] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.38022518157958984 |
|
|
[2026-01-03 15:17:39,184] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:39,513] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3299715518951416 |
|
|
[2026-01-03 15:17:39,557] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:284] data_loader_len: 1 |
|
|
[2026-01-03 15:17:39,558] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:284] sample_packing_eff_est across ranks: [0.54241943359375] |
|
|
[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:284] sample_packing_eff_est: None |
|
|
[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:284] total_num_steps: 3 |
|
|
[2026-01-03 15:17:39,589] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:284] total_num_tokens: 150_536 |
|
|
[2026-01-03 15:17:39,600] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:284] `total_supervised_tokens: 110_596` |
|
|
[2026-01-03 15:17:39,637] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:40,084] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:40,417] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3335990905761719 |
|
|
[2026-01-03 15:17:40,418] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:40,757] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.33975672721862793 |
|
|
[2026-01-03 15:17:40,758] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:41,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3313770294189453 |
|
|
[2026-01-03 15:17:41,089] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.32965993881225586 |
|
|
[2026-01-03 15:17:41,419] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [10] |
|
|
[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:284] data_loader_len: 2 |
|
|
[2026-01-03 15:17:41,419] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:284] sample_packing_eff_est across ranks: [0.918798828125] |
|
|
[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:284] sample_packing_eff_est: 0.92 |
|
|
[2026-01-03 15:17:41,420] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:284] total_num_steps: 6 |
|
|
[2026-01-03 15:17:41,420] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:284] Maximum number of steps set at 6 |
|
|
[2026-01-03 15:17:41,527] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:284] loading tokenizer... Qwen/Qwen3-4B-Instruct-2507 |
|
|
[2026-01-03 15:17:42,820] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:284] EOS: 151645 / <|im_end|> |
|
|
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None |
|
|
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <|endoftext|> |
|
|
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None |
|
|
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:284] Loading model |
|
|
[2026-01-03 15:17:42,956] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:284] Patched Trainer.evaluation_loop with nanmean loss calculation |
|
|
[2026-01-03 15:17:42,961] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:284] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation |
|
|
[2026-01-03 15:17:42,961] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:284] Applying multipack dataloader patch for sample packing... |
|
|
model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 32.8kB [00:00, 47.6MB/s] |
|
|
model-00001-of-00003.safetensors: 0%| | 0.00/3.96G [00:00<?, ?B/s]
model-00001-of-00003.safetensors: 0%| | 630k/3.96G [00:00<1:33:08, 708kB/s]
model-00001-of-00003.safetensors: 0%| | 2.15M/3.96G [00:01<35:41, 1.85MB/s]
model-00001-of-00003.safetensors: 3%|βββββ | 136M/3.96G [00:01<00:24, 158MB/s]
model-00001-of-00003.safetensors: 5%|ββββββββ | 203M/3.96G [00:01<00:21, 178MB/s]
model-00001-of-00003.safetensors: 15%|βββββββββββββββββββββββ | 606M/3.96G [00:01<00:05, 641MB/s]
model-00001-of-00003.safetensors: 19%|ββββββββββββββββββββββββββββ | 741M/3.96G [00:02<00:04, 696MB/s]
model-00001-of-00003.safetensors: 22%|βββββββββββββββββββββββββββββββββ | 878M/3.96G [00:02<00:04, 752MB/s]
model-00001-of-00003.safetensors: 26%|ββββββββββββββββββββββββββββββββββββββ | 1.01G/3.96G [00:02<00:03, 747MB/s]
model-00001-of-00003.safetensors: 29%|βββββββββββββββββββββββββββββββββββββββββββ | 1.15G/3.96G [00:02<00:03, 789MB/s]
model-00001-of-00003.safetensors: 32%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 1.28G/3.96G [00:02<00:03, 842MB/s]
model-00001-of-00003.safetensors: 36%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.41G/3.96G [00:02<00:02, 894MB/s]
model-00001-of-00003.safetensors: 39%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.55G/3.96G [00:02<00:02, 917MB/s]
model-00001-of-00003.safetensors: 43%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.68G/3.96G [00:03<00:02, 933MB/s]
model-00001-of-00003.safetensors: 46%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.82G/3.96G [00:03<00:02, 937MB/s]
model-00001-of-00003.safetensors: 49%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.95G/3.96G [00:03<00:02, 768MB/s]
model-00001-of-00003.safetensors: 56%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.22G/3.96G [00:03<00:01, 1.07GB/s]
model-00001-of-00003.safetensors: 59%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.35G/3.96G [00:03<00:01, 1.06GB/s]
model-00001-of-00003.safetensors: 63%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.49G/3.96G [00:03<00:01, 1.04GB/s]
model-00001-of-00003.safetensors: 66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.62G/3.96G [00:04<00:01, 1.04GB/s]
model-00001-of-00003.safetensors: 70%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.75G/3.96G [00:04<00:01, 1.03GB/s]
model-00001-of-00003.safetensors: 73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.89G/3.96G [00:04<00:01, 1.00GB/s]
model-00001-of-00003.safetensors: 76%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.02G/3.96G [00:04<00:00, 1.02GB/s]
model-00001-of-00003.safetensors: 80%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.16G/3.96G [00:04<00:00, 1.03GB/s]
model-00001-of-00003.safetensors: 83%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.29G/3.96G [00:04<00:00, 1.04GB/s]
model-00001-of-00003.safetensors: 86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.42G/3.96G [00:04<00:00, 1.05GB/s]
model-00001-of-00003.safetensors: 90%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.56G/3.96G [00:04<00:00, 1.06GB/s]
model-00001-of-00003.safetensors: 93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.69G/3.96G [00:05<00:00, 1.02GB/s]
model-00001-of-00003.safetensors: 97%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.82G/3.96G [00:05<00:00, 1.02GB/s]
model-00001-of-00003.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.96G/3.96G [00:05<00:00, 1.02GB/s]
model-00001-of-00003.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.96G/3.96G [00:05<00:00, 746MB/s] |
|
|
model-00002-of-00003.safetensors: 0%| | 0.00/3.99G [00:00<?, ?B/s]
model-00002-of-00003.safetensors: 0%| | 880k/3.99G [00:00<1:04:55, 1.02MB/s]
model-00002-of-00003.safetensors: 0%| | 2.56M/3.99G [00:01<27:15, 2.44MB/s]
model-00002-of-00003.safetensors: 2%|βββ | 69.6M/3.99G [00:01<00:46, 84.8MB/s]
model-00002-of-00003.safetensors: 5%|ββββββββ | 204M/3.99G [00:01<00:20, 183MB/s]
model-00002-of-00003.safetensors: 14%|ββββββββββββββββββββ | 542M/3.99G [00:01<00:06, 555MB/s]
model-00002-of-00003.safetensors: 17%|βββββββββββββββββββββββββ | 676M/3.99G [00:02<00:05, 648MB/s]
model-00002-of-00003.safetensors: 20%|ββββββββββββββββββββββββββββββ | 810M/3.99G [00:02<00:04, 733MB/s]
model-00002-of-00003.safetensors: 24%|βββββββββββββββββββββββββββββββββββ | 943M/3.99G [00:02<00:04, 730MB/s]
model-00002-of-00003.safetensors: 29%|βββββββββββββββββββββββββββββββββββββββββββ | 1.14G/3.99G [00:02<00:03, 876MB/s]
model-00002-of-00003.safetensors: 32%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 1.28G/3.99G [00:02<00:03, 884MB/s]
model-00002-of-00003.safetensors: 35%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.41G/3.99G [00:02<00:02, 921MB/s]
model-00002-of-00003.safetensors: 40%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.58G/3.99G [00:02<00:02, 949MB/s]
model-00002-of-00003.safetensors: 43%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.71G/3.99G [00:03<00:02, 935MB/s]
model-00002-of-00003.safetensors: 46%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.85G/3.99G [00:03<00:02, 943MB/s]
model-00002-of-00003.safetensors: 50%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1.98G/3.99G [00:03<00:02, 982MB/s]
model-00002-of-00003.safetensors: 53%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.11G/3.99G [00:03<00:01, 996MB/s]
model-00002-of-00003.safetensors: 56%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.25G/3.99G [00:03<00:01, 980MB/s]
model-00002-of-00003.safetensors: 60%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.38G/3.99G [00:03<00:01, 1.00GB/s]
model-00002-of-00003.safetensors: 63%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.52G/3.99G [00:03<00:01, 1.00GB/s]
model-00002-of-00003.safetensors: 66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.65G/3.99G [00:04<00:01, 1.01GB/s]
model-00002-of-00003.safetensors: 70%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.78G/3.99G [00:04<00:01, 957MB/s]
model-00002-of-00003.safetensors: 73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2.92G/3.99G [00:04<00:01, 1.02GB/s]
model-00002-of-00003.safetensors: 76%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.05G/3.99G [00:04<00:00, 1.01GB/s]
model-00002-of-00003.safetensors: 80%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.18G/3.99G [00:04<00:00, 1.02GB/s]
model-00002-of-00003.safetensors: 83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.32G/3.99G [00:04<00:00, 993MB/s]
model-00002-of-00003.safetensors: 87%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.45G/3.99G [00:04<00:00, 984MB/s]
model-00002-of-00003.safetensors: 90%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.59G/3.99G [00:04<00:00, 1.00GB/s]
model-00002-of-00003.safetensors: 93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.72G/3.99G [00:05<00:00, 991MB/s]
model-00002-of-00003.safetensors: 97%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3.85G/3.99G [00:05<00:00, 988MB/s]
model-00002-of-00003.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.99G/3.99G [00:05<00:00, 1.01GB/s]
model-00002-of-00003.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.99G/3.99G [00:05<00:00, 743MB/s] |
|
|
model-00003-of-00003.safetensors: 0%| | 0.00/99.6M [00:00<?, ?B/s]
model-00003-of-00003.safetensors: 33%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 32.6M/99.6M [00:00<00:01, 49.6MB/s]
model-00003-of-00003.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 99.6M/99.6M [00:00<00:00, 114MB/s]
model-00003-of-00003.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 99.6M/99.6M [00:00<00:00, 101MB/s] |
|
|
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 67%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2/3 [00:00<00:00, 20.00it/s]
Loading checkpoint shards: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3/3 [00:00<00:00, 29.22it/s] |
|
|
generation_config.json: 0%| | 0.00/238 [00:00<?, ?B/s]
generation_config.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 238/238 [00:00<00:00, 938kB/s] |
|
|
[2026-01-03 15:17:57,914] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:284] Converting modules to torch.bfloat16 |
|
|
[2026-01-03 15:17:59,169] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:284] Memory usage after model load 0.000GB () |
|
|
[2026-01-03 15:18:21,194] [INFO] [axolotl.train.save_initial_configs:417] [PID:284] Pre-saving tokenizer to ./tieto-code-mini-4b-instruct... |
|
|
[2026-01-03 15:18:21,607] [INFO] [axolotl.train.save_initial_configs:422] [PID:284] Pre-saving model config to ./tieto-code-mini-4b-instruct... |
|
|
[2026-01-03 15:18:21,614] [INFO] [axolotl.train.execute_training:212] [PID:284] Starting trainer... |
|
|
[2026-01-03 15:18:24,659] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.132145643234253 |
|
|
[2026-01-03 15:18:25,818] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1583445072174072 |
|
|
[2026-01-03 15:18:26,866] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0472462177276611 |
|
|
[2026-01-03 15:18:27,906] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0393366813659668 |
|
|
[2026-01-03 15:18:27,906] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [10] |
|
|
0%| | 0/6 [00:00<?, ?it/s][2026-01-03 15:18:28,025] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:18:30,107] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0382018089294434 |
|
|
[2026-01-03 15:18:31,265] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1572446823120117 |
|
|
[2026-01-03 15:18:32,337] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0719947814941406 |
|
|
[2026-01-03 15:18:33,406] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0679423809051514 |
|
|
[2026-01-03 15:18:33,407] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 3.1398396492004395, 'eval_runtime': 2.9112, 'eval_samples_per_second': 8.931, 'eval_steps_per_second': 4.465, 'eval_ppl': 23.1002, 'memory/max_active (GiB)': 30.84, 'memory/max_allocated (GiB)': 30.84, 'memory/device_reserved (GiB)': 32.58, 'epoch': 0} |
|
|
0%| | 0/6 [00:08<?, ?it/s] |
|
|
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 116.42it/s][A |
|
|
[A
17%|ββββββββββββββββββββββββββββββββ | 1/6 [00:18<01:31, 18.36s/it]
{'loss': 3.1865, 'grad_norm': 37.0, 'learning_rate': 0.0, 'ppl': 24.2036, 'memory/max_active (GiB)': 46.07, 'memory/max_allocated (GiB)': 46.07, 'memory/device_reserved (GiB)': 51.24, 'tokens_per_second_per_gpu': 8211.09, 'total_tokens': 54696, 'epoch': 0.4} |
|
|
17%|ββββββββββββββββββββββββββββββββ | 1/6 [00:18<01:31, 18.36s/it][2026-01-03 15:18:46,390] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:18:48,760] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.101508617401123 |
|
|
[2026-01-03 15:18:49,874] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1127257347106934 |
|
|
[2026-01-03 15:18:50,996] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1217362880706787 |
|
|
[2026-01-03 15:18:52,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1201398372650146 |
|
|
[2026-01-03 15:18:52,117] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 3.1398396492004395, 'eval_runtime': 1.88, 'eval_samples_per_second': 13.83, 'eval_steps_per_second': 6.915, 'eval_ppl': 23.1002, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 51.24, 'epoch': 0.4} |
|
|
17%|ββββββββββββββββββββββββββββββββ | 1/6 [00:25<01:31, 18.36s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 20.14it/s][A |
|
|
[A
33%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2/6 [00:32<01:03, 15.79s/it]
{'loss': 3.2114, 'grad_norm': 37.25, 'learning_rate': 2.0000000000000003e-06, 'ppl': 24.8138, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.51, 'tokens_per_second_per_gpu': 7540.09, 'total_tokens': 109425, 'epoch': 0.8} |
|
|
33%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2/6 [00:32<01:03, 15.79s/it][2026-01-03 15:19:00,384] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:19:02,683] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1525053977966309 |
|
|
[2026-01-03 15:19:03,815] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.131868600845337 |
|
|
[2026-01-03 15:19:04,916] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.099958896636963 |
|
|
[2026-01-03 15:19:06,081] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1644210815429688 |
|
|
[2026-01-03 15:19:06,081] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 3.0861029624938965, 'eval_runtime': 1.9265, 'eval_samples_per_second': 13.496, 'eval_steps_per_second': 6.748, 'eval_ppl': 21.8916, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.51, 'epoch': 0.8} |
|
|
33%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 2/6 [00:39<01:03, 15.79s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 20.09it/s][A |
|
|
[A[2026-01-03 15:19:08,022] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-2 |
|
|
50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3/6 [01:44<02:04, 41.57s/it]
{'loss': 3.1362, 'grad_norm': 35.5, 'learning_rate': 4.000000000000001e-06, 'ppl': 23.0162, 'memory/max_active (GiB)': 48.34, 'memory/max_allocated (GiB)': 48.34, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 5555.06, 'total_tokens': 130768, 'epoch': 1.0} |
|
|
50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3/6 [01:44<02:04, 41.57s/it][2026-01-03 15:20:12,627] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:20:15,379] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.314366102218628 |
|
|
[2026-01-03 15:20:16,662] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2827715873718262 |
|
|
[2026-01-03 15:20:17,967] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.304215431213379 |
|
|
[2026-01-03 15:20:19,242] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2750258445739746 |
|
|
[2026-01-03 15:20:19,243] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 2.908294916152954, 'eval_runtime': 2.149, 'eval_samples_per_second': 12.099, 'eval_steps_per_second': 6.049, 'eval_ppl': 18.3255, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.0} |
|
|
50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 3/6 [01:53<02:04, 41.57s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 20.02it/s][A |
|
|
[A
67%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 4/6 [02:03<01:05, 32.65s/it]
{'loss': 2.9691, 'grad_norm': 31.125, 'learning_rate': 6e-06, 'ppl': 19.4744, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 7579.19, 'total_tokens': 185778, 'epoch': 1.4} |
|
|
67%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 4/6 [02:03<01:05, 32.65s/it][2026-01-03 15:20:31,608] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:20:34,397] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3423559665679932 |
|
|
[2026-01-03 15:20:35,906] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.5087320804595947 |
|
|
[2026-01-03 15:20:37,385] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.478161096572876 |
|
|
[2026-01-03 15:20:38,712] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3267621994018555 |
|
|
[2026-01-03 15:20:38,712] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 2.72947359085083, 'eval_runtime': 2.1836, 'eval_samples_per_second': 11.907, 'eval_steps_per_second': 5.953, 'eval_ppl': 15.3248, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.4} |
|
|
67%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 4/6 [02:12<01:05, 32.65s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 20.22it/s][A |
|
|
[A[2026-01-03 15:20:40,912] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-4 |
|
|
83%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 5/6 [03:19<00:48, 48.24s/it]
{'loss': 2.7408, 'grad_norm': 22.125, 'learning_rate': 8.000000000000001e-06, 'ppl': 15.4994, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 7537.94, 'total_tokens': 240377, 'epoch': 1.8} |
|
|
83%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 5/6 [03:19<00:48, 48.24s/it][2026-01-03 15:21:47,499] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:21:50,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.5038812160491943 |
|
|
[2026-01-03 15:21:51,792] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.288691759109497 |
|
|
[2026-01-03 15:21:53,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2864303588867188 |
|
|
[2026-01-03 15:21:54,394] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3142154216766357 |
|
|
[2026-01-03 15:21:54,394] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 2.5420169830322266, 'eval_runtime': 2.3069, 'eval_samples_per_second': 11.27, 'eval_steps_per_second': 5.635, 'eval_ppl': 12.7053, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.8} |
|
|
83%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 5/6 [03:28<00:48, 48.24s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 20.04it/s][A |
|
|
[A
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [03:31<00:00, 35.95s/it]
{'loss': 2.5747, 'grad_norm': 11.25, 'learning_rate': 1e-05, 'ppl': 13.1274, 'memory/max_active (GiB)': 48.34, 'memory/max_allocated (GiB)': 48.34, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 5517.82, 'total_tokens': 261536, 'epoch': 2.0} |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [03:31<00:00, 35.95s/it][2026-01-03 15:21:59,590] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step... |
|
|
[2026-01-03 15:22:02,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2924823760986328 |
|
|
[2026-01-03 15:22:03,963] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.4604885578155518 |
|
|
[2026-01-03 15:22:05,262] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2980804443359375 |
|
|
[2026-01-03 15:22:06,512] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2491240501403809 |
|
|
[2026-01-03 15:22:06,512] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1] |
|
|
|
|
|
0%| | 0/1 [00:00<?, ?it/s][A
|
|
|
[A{'eval_loss': 2.455676555633545, 'eval_runtime': 2.0695, 'eval_samples_per_second': 12.563, 'eval_steps_per_second': 6.282, 'eval_ppl': 11.6543, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 2.0} |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [03:40<00:00, 35.95s/it] |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1/1 [00:00<00:00, 19.90it/s][A |
|
|
[A[2026-01-03 15:22:08,596] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-6 |
|
|
{'train_runtime': 279.4175, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.021, 'train_loss': 2.9697999954223633, 'memory/max_active (GiB)': 17.39, 'memory/max_allocated (GiB)': 17.39, 'memory/device_reserved (GiB)': 60.52, 'epoch': 2.0} |
|
|
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [04:39<00:00, 35.95s/it]
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [04:39<00:00, 46.57s/it] |
|
|
[2026-01-03 15:23:07,487] [INFO] [axolotl.train.save_trained_model:233] [PID:284] Training completed! Saving trained model to ./tieto-code-mini-4b-instruct. |
|
|
[2026-01-03 15:23:37,492] [INFO] [axolotl.train.save_trained_model:351] [PID:284] Model successfully saved to ./tieto-code-mini-4b-instruct |
|
|
[0m |