Upload 11 files

2763379 verified about 1 month ago

88.8 kB

	[2026-01-03 15:17:19,855] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:284] bf16 support detected, enabling for this configuration.
	config.json: 0%\| \| 0.00/727 [00:00<?, ?B/s] config.json: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 727/727 [00:00<00:00, 1.71MB/s]
	[2026-01-03 15:17:20,070] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:284] baseline 0.000GB ()
	[2026-01-03 15:17:20,071] [INFO] [axolotl.cli.config.load_cfg:256] [PID:284] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "config.yaml",
	"base_model": "Qwen/Qwen3-4B-Instruct-2507",
	"base_model_config": "Qwen/Qwen3-4B-Instruct-2507",
	"batch_size": 8,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_90",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 24,
	"dataset_prepared_path": "last_run_prepared",
	"datasets": [
	{
	"chat_template": "chatml",
	"field_messages": "messages",
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "data.jsonl",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.8.0"
	},
	"eval_batch_size": 2,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_sample_packing": true,
	"eval_steps": 0.08333333333333333,
	"eval_table_size": 0,
	"evals_per_epoch": 4,
	"experimental_skip_move_to_device": true,
	"flash_attention": true,
	"fp16": false,
	"gradient_accumulation_steps": 4,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": true
	},
	"group_by_length": false,
	"include_tkps": true,
	"is_falcon_derived_model": false,
	"is_llama_derived_model": false,
	"is_mistral_derived_model": false,
	"learning_rate": 2e-05,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"mean_resizing_embeddings": false,
	"micro_batch_size": 2,
	"model_config_type": "qwen3",
	"num_epochs": 3.0,
	"optimizer": "adamw_bnb_8bit",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./tieto-code-mini-4b-instruct",
	"pad_to_sequence_len": true,
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": true,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_steps": 0.3333333333333333,
	"saves_per_epoch": 1,
	"sequence_len": 8192,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": false,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507",
	"tokenizer_save_jinja_files": true,
	"tokenizer_type": "AutoTokenizer",
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"trust_remote_code": true,
	"type_of_model": "AutoModelForCausalLM",
	"use_otel_metrics": false,
	"use_ray": false,
	"val_set_size": 0.05,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"warmup_steps": 10,
	"weight_decay": 0.0,
	"world_size": 1
	}
	[2026-01-03 15:17:20,074] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:284] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
	tokenizer_config.json: 0.00B [00:00, ?B/s] tokenizer_config.json: 9.38kB [00:00, 14.1MB/s]
	vocab.json: 0.00B [00:00, ?B/s] vocab.json: 2.78MB [00:00, 46.0MB/s]
	merges.txt: 0.00B [00:00, ?B/s] merges.txt: 1.67MB [00:00, 42.9MB/s]
	tokenizer.json: 0%\| \| 0.00/11.4M [00:00<?, ?B/s] tokenizer.json: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 11.4M/11.4M [00:00<00:00, 19.7MB/s] tokenizer.json: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 11.4M/11.4M [00:00<00:00, 19.7MB/s]
	[2026-01-03 15:17:22,709] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:284] EOS: 151645 / <\|im_end\|>
	[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None
	[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <\|endoftext\|>
	[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None
	[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:284] Unable to find prepared dataset in last_run_prepared/90a4bd078072b9d1de83a8db5d6b8671
	[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:284] Loading raw datasets...
	[2026-01-03 15:17:22,714] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:284] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 503 examples [00:00, 22482.50 examples/s]
	[2026-01-03 15:17:23,108] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:284] Loading dataset: data.jsonl with base_type: chat_template and prompt_style: None
	[2026-01-03 15:17:23,136] [INFO] [axolotl.prompt_strategies.chat_template.__call__:996] [PID:284] Using chat template:
	---
	{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<\|im_start\|>' + message['role'] + '
	' + message['content'] + '<\|im_end\|>' + '
	'}}{% endfor %}{% if add_generation_prompt %}{{ '<\|im_start\|>assistant
	' }}{% endif %}

	---
	Tokenizing Prompts (num_proc=24): 0%\| \| 0/503 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=24): 4%\|█████▉ \| 21/503 [00:02<00:57, 8.45 examples/s] Tokenizing Prompts (num_proc=24): 8%\|███████████▉ \| 42/503 [00:02<00:25, 17.93 examples/s] Tokenizing Prompts (num_proc=24): 13%\|█████████████████▉ \| 63/503 [00:03<00:18, 24.44 examples/s] Tokenizing Prompts (num_proc=24): 21%\|█████████████████████████████▋ \| 105/503 [00:03<00:08, 47.25 examples/s] Tokenizing Prompts (num_proc=24): 25%\|███████████████████████████████████▌ \| 126/503 [00:03<00:06, 54.13 examples/s] Tokenizing Prompts (num_proc=24): 33%\|███████████████████████████████████████████████▍ \| 168/503 [00:04<00:05, 65.11 examples/s] Tokenizing Prompts (num_proc=24): 38%\|█████████████████████████████████████████████████████▎ \| 189/503 [00:04<00:04, 69.18 examples/s] Tokenizing Prompts (num_proc=24): 42%\|███████████████████████████████████████████████████████████▎ \| 210/503 [00:04<00:04, 72.90 examples/s] Tokenizing Prompts (num_proc=24): 46%\|█████████████████████████████████████████████████████████████████▏ \| 231/503 [00:04<00:03, 76.41 examples/s] Tokenizing Prompts (num_proc=24): 50%\|███████████████████████████████████████████████████████████████████████▏ \| 252/503 [00:05<00:03, 78.36 examples/s] Tokenizing Prompts (num_proc=24): 54%\|█████████████████████████████████████████████████████████████████████████████ \| 273/503 [00:05<00:02, 80.45 examples/s] Tokenizing Prompts (num_proc=24): 58%\|██████████████████████████████████████████████████████████████████████████████████▉ \| 294/503 [00:05<00:02, 81.21 examples/s] Tokenizing Prompts (num_proc=24): 63%\|████████████████████████████████████████████████████████████████████████████████████████▉ \| 315/503 [00:05<00:02, 82.67 examples/s] Tokenizing Prompts (num_proc=24): 67%\|██████████████████████████████████████████████████████████████████████████████████████████████▊ \| 336/503 [00:06<00:01, 84.26 examples/s] Tokenizing Prompts (num_proc=24): 75%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 378/503 [00:06<00:01, 84.70 examples/s] Tokenizing Prompts (num_proc=24): 83%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 420/503 [00:07<00:00, 96.25 examples/s] Tokenizing Prompts (num_proc=24): 88%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 441/503 [00:07<00:00, 96.00 examples/s] Tokenizing Prompts (num_proc=24): 92%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 462/503 [00:07<00:00, 92.73 examples/s] Tokenizing Prompts (num_proc=24): 96%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 483/503 [00:07<00:00, 98.27 examples/s] Tokenizing Prompts (num_proc=24): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:07<00:00, 96.06 examples/s] Tokenizing Prompts (num_proc=24): 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:08<00:00, 61.38 examples/s]
	[2026-01-03 15:17:31,724] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:284] min_input_len: 141
	[2026-01-03 15:17:31,725] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:284] max_input_len: 627
	Dropping Long Sequences (>8192) (num_proc=24): 0%\| \| 0/503 [00:00<?, ? examples/s] Dropping Long Sequences (>8192) (num_proc=24): 4%\|█████▍ \| 21/503 [00:00<00:18, 26.29 examples/s] Dropping Long Sequences (>8192) (num_proc=24): 25%\|████████████████████████████████ \| 126/503 [00:00<00:02, 180.11 examples/s] Dropping Long Sequences (>8192) (num_proc=24): 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:01<00:00, 810.59 examples/s] Dropping Long Sequences (>8192) (num_proc=24): 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:01<00:00, 414.08 examples/s]
	Drop Samples with Zero Trainable Tokens (num_proc=24): 0%\| \| 0/503 [00:00<?, ? examples/s] Drop Samples with Zero Trainable Tokens (num_proc=24): 4%\|█████ \| 21/503 [00:00<00:17, 26.81 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=24): 8%\|██████████▏ \| 42/503 [00:00<00:09, 50.73 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=24): 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:01<00:00, 403.34 examples/s]
	Add position_id column (Sample Packing) (num_proc=24): 0%\| \| 0/503 [00:00<?, ? examples/s] Add position_id column (Sample Packing) (num_proc=24): 4%\|█████ \| 21/503 [00:00<00:18, 26.21 examples/s] Add position_id column (Sample Packing) (num_proc=24): 33%\|████████████████████████████████████████ \| 168/503 [00:00<00:01, 243.49 examples/s] Add position_id column (Sample Packing) (num_proc=24): 96%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 483/503 [00:01<00:00, 757.40 examples/s] Add position_id column (Sample Packing) (num_proc=24): 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:01<00:00, 408.06 examples/s]
	Saving the dataset (0/1 shards): 0%\| \| 0/503 [00:00<?, ? examples/s] Saving the dataset (0/1 shards): 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:00<00:00, 2868.10 examples/s] Saving the dataset (1/1 shards): 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:00<00:00, 2868.10 examples/s] Saving the dataset (1/1 shards): 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 503/503 [00:00<00:00, 1856.18 examples/s]
	[2026-01-03 15:17:36,239] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:284] total_num_tokens: 8_887
	[2026-01-03 15:17:36,241] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:284] `total_supervised_tokens: 6_724`
	[2026-01-03 15:17:36,251] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:38,093] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:38,424] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.33158135414123535
	[2026-01-03 15:17:38,425] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:38,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.377777099609375
	[2026-01-03 15:17:38,803] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:39,183] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.38022518157958984
	[2026-01-03 15:17:39,184] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:39,513] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3299715518951416
	[2026-01-03 15:17:39,557] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]
	[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:284] data_loader_len: 1
	[2026-01-03 15:17:39,558] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:284] sample_packing_eff_est across ranks: [0.54241943359375]
	[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:284] sample_packing_eff_est: None
	[2026-01-03 15:17:39,558] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:284] total_num_steps: 3
	[2026-01-03 15:17:39,589] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:284] total_num_tokens: 150_536
	[2026-01-03 15:17:39,600] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:284] `total_supervised_tokens: 110_596`
	[2026-01-03 15:17:39,637] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:40,084] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:40,417] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3335990905761719
	[2026-01-03 15:17:40,418] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:40,757] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.33975672721862793
	[2026-01-03 15:17:40,758] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:41,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.3313770294189453
	[2026-01-03 15:17:41,089] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:284] Using single process for pack_parallel, running sequentially.
	[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 0.32965993881225586
	[2026-01-03 15:17:41,419] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [10]
	[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:284] data_loader_len: 2
	[2026-01-03 15:17:41,419] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:284] sample_packing_eff_est across ranks: [0.918798828125]
	[2026-01-03 15:17:41,419] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:284] sample_packing_eff_est: 0.92
	[2026-01-03 15:17:41,420] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:284] total_num_steps: 6
	[2026-01-03 15:17:41,420] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:284] Maximum number of steps set at 6
	[2026-01-03 15:17:41,527] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:284] loading tokenizer... Qwen/Qwen3-4B-Instruct-2507
	[2026-01-03 15:17:42,820] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:284] EOS: 151645 / <\|im_end\|>
	[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None
	[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <\|endoftext\|>
	[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None
	[2026-01-03 15:17:42,821] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:284] Loading model
	[2026-01-03 15:17:42,956] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:284] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-01-03 15:17:42,961] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:284] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-01-03 15:17:42,961] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:284] Applying multipack dataloader patch for sample packing...
	model.safetensors.index.json: 0.00B [00:00, ?B/s] model.safetensors.index.json: 32.8kB [00:00, 47.6MB/s]
	model-00001-of-00003.safetensors: 0%\| \| 0.00/3.96G [00:00<?, ?B/s] model-00001-of-00003.safetensors: 0%\| \| 630k/3.96G [00:00<1:33:08, 708kB/s] model-00001-of-00003.safetensors: 0%\| \| 2.15M/3.96G [00:01<35:41, 1.85MB/s] model-00001-of-00003.safetensors: 3%\|█████ \| 136M/3.96G [00:01<00:24, 158MB/s] model-00001-of-00003.safetensors: 5%\|███████▌ \| 203M/3.96G [00:01<00:21, 178MB/s] model-00001-of-00003.safetensors: 15%\|██████████████████████▋ \| 606M/3.96G [00:01<00:05, 641MB/s] model-00001-of-00003.safetensors: 19%\|███████████████████████████▋ \| 741M/3.96G [00:02<00:04, 696MB/s] model-00001-of-00003.safetensors: 22%\|████████████████████████████████▊ \| 878M/3.96G [00:02<00:04, 752MB/s] model-00001-of-00003.safetensors: 26%\|█████████████████████████████████████▌ \| 1.01G/3.96G [00:02<00:03, 747MB/s] model-00001-of-00003.safetensors: 29%\|██████████████████████████████████████████▌ \| 1.15G/3.96G [00:02<00:03, 789MB/s] model-00001-of-00003.safetensors: 32%\|███████████████████████████████████████████████▌ \| 1.28G/3.96G [00:02<00:03, 842MB/s] model-00001-of-00003.safetensors: 36%\|████████████████████████████████████████████████████▌ \| 1.41G/3.96G [00:02<00:02, 894MB/s] model-00001-of-00003.safetensors: 39%\|█████████████████████████████████████████████████████████▌ \| 1.55G/3.96G [00:02<00:02, 917MB/s] model-00001-of-00003.safetensors: 43%\|██████████████████████████████████████████████████████████████▍ \| 1.68G/3.96G [00:03<00:02, 933MB/s] model-00001-of-00003.safetensors: 46%\|███████████████████████████████████████████████████████████████████▍ \| 1.82G/3.96G [00:03<00:02, 937MB/s] model-00001-of-00003.safetensors: 49%\|████████████████████████████████████████████████████████████████████████▍ \| 1.95G/3.96G [00:03<00:02, 768MB/s] model-00001-of-00003.safetensors: 56%\|█████████████████████████████████████████████████████████████████████████████████▊ \| 2.22G/3.96G [00:03<00:01, 1.07GB/s] model-00001-of-00003.safetensors: 59%\|██████████████████████████████████████████████████████████████████████████████████████▊ \| 2.35G/3.96G [00:03<00:01, 1.06GB/s] model-00001-of-00003.safetensors: 63%\|███████████████████████████████████████████████████████████████████████████████████████████▋ \| 2.49G/3.96G [00:03<00:01, 1.04GB/s] model-00001-of-00003.safetensors: 66%\|████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 2.62G/3.96G [00:04<00:01, 1.04GB/s] model-00001-of-00003.safetensors: 70%\|█████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 2.75G/3.96G [00:04<00:01, 1.03GB/s] model-00001-of-00003.safetensors: 73%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 2.89G/3.96G [00:04<00:01, 1.00GB/s] model-00001-of-00003.safetensors: 76%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 3.02G/3.96G [00:04<00:00, 1.02GB/s] model-00001-of-00003.safetensors: 80%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 3.16G/3.96G [00:04<00:00, 1.03GB/s] model-00001-of-00003.safetensors: 83%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 3.29G/3.96G [00:04<00:00, 1.04GB/s] model-00001-of-00003.safetensors: 86%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 3.42G/3.96G [00:04<00:00, 1.05GB/s] model-00001-of-00003.safetensors: 90%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 3.56G/3.96G [00:04<00:00, 1.06GB/s] model-00001-of-00003.safetensors: 93%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 3.69G/3.96G [00:05<00:00, 1.02GB/s] model-00001-of-00003.safetensors: 97%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 3.82G/3.96G [00:05<00:00, 1.02GB/s] model-00001-of-00003.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 3.96G/3.96G [00:05<00:00, 1.02GB/s] model-00001-of-00003.safetensors: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 3.96G/3.96G [00:05<00:00, 746MB/s]
	model-00002-of-00003.safetensors: 0%\| \| 0.00/3.99G [00:00<?, ?B/s] model-00002-of-00003.safetensors: 0%\| \| 880k/3.99G [00:00<1:04:55, 1.02MB/s] model-00002-of-00003.safetensors: 0%\| \| 2.56M/3.99G [00:01<27:15, 2.44MB/s] model-00002-of-00003.safetensors: 2%\|██▌ \| 69.6M/3.99G [00:01<00:46, 84.8MB/s] model-00002-of-00003.safetensors: 5%\|███████▌ \| 204M/3.99G [00:01<00:20, 183MB/s] model-00002-of-00003.safetensors: 14%\|████████████████████ \| 542M/3.99G [00:01<00:06, 555MB/s] model-00002-of-00003.safetensors: 17%\|█████████████████████████ \| 676M/3.99G [00:02<00:05, 648MB/s] model-00002-of-00003.safetensors: 20%\|██████████████████████████████ \| 810M/3.99G [00:02<00:04, 733MB/s] model-00002-of-00003.safetensors: 24%\|███████████████████████████████████ \| 943M/3.99G [00:02<00:04, 730MB/s] model-00002-of-00003.safetensors: 29%\|██████████████████████████████████████████▏ \| 1.14G/3.99G [00:02<00:03, 876MB/s] model-00002-of-00003.safetensors: 32%\|███████████████████████████████████████████████▏ \| 1.28G/3.99G [00:02<00:03, 884MB/s] model-00002-of-00003.safetensors: 35%\|████████████████████████████████████████████████████ \| 1.41G/3.99G [00:02<00:02, 921MB/s] model-00002-of-00003.safetensors: 40%\|██████████████████████████████████████████████████████████▏ \| 1.58G/3.99G [00:02<00:02, 949MB/s] model-00002-of-00003.safetensors: 43%\|███████████████████████████████████████████████████████████████ \| 1.71G/3.99G [00:03<00:02, 935MB/s] model-00002-of-00003.safetensors: 46%\|████████████████████████████████████████████████████████████████████ \| 1.85G/3.99G [00:03<00:02, 943MB/s] model-00002-of-00003.safetensors: 50%\|████████████████████████████████████████████████████████████████████████▉ \| 1.98G/3.99G [00:03<00:02, 982MB/s] model-00002-of-00003.safetensors: 53%\|█████████████████████████████████████████████████████████████████████████████▉ \| 2.11G/3.99G [00:03<00:01, 996MB/s] model-00002-of-00003.safetensors: 56%\|██████████████████████████████████████████████████████████████████████████████████▉ \| 2.25G/3.99G [00:03<00:01, 980MB/s] model-00002-of-00003.safetensors: 60%\|███████████████████████████████████████████████████████████████████████████████████████▏ \| 2.38G/3.99G [00:03<00:01, 1.00GB/s] model-00002-of-00003.safetensors: 63%\|████████████████████████████████████████████████████████████████████████████████████████████▏ \| 2.52G/3.99G [00:03<00:01, 1.00GB/s] model-00002-of-00003.safetensors: 66%\|████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 2.65G/3.99G [00:04<00:01, 1.01GB/s] model-00002-of-00003.safetensors: 70%\|██████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 2.78G/3.99G [00:04<00:01, 957MB/s] model-00002-of-00003.safetensors: 73%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 2.92G/3.99G [00:04<00:01, 1.02GB/s] model-00002-of-00003.safetensors: 76%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 3.05G/3.99G [00:04<00:00, 1.01GB/s] model-00002-of-00003.safetensors: 80%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 3.18G/3.99G [00:04<00:00, 1.02GB/s] model-00002-of-00003.safetensors: 83%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 3.32G/3.99G [00:04<00:00, 993MB/s] model-00002-of-00003.safetensors: 87%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 3.45G/3.99G [00:04<00:00, 984MB/s] model-00002-of-00003.safetensors: 90%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 3.59G/3.99G [00:04<00:00, 1.00GB/s] model-00002-of-00003.safetensors: 93%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 3.72G/3.99G [00:05<00:00, 991MB/s] model-00002-of-00003.safetensors: 97%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 3.85G/3.99G [00:05<00:00, 988MB/s] model-00002-of-00003.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 3.99G/3.99G [00:05<00:00, 1.01GB/s] model-00002-of-00003.safetensors: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 3.99G/3.99G [00:05<00:00, 743MB/s]
	model-00003-of-00003.safetensors: 0%\| \| 0.00/99.6M [00:00<?, ?B/s] model-00003-of-00003.safetensors: 33%\|███████████████████████████████████████████████▋ \| 32.6M/99.6M [00:00<00:01, 49.6MB/s] model-00003-of-00003.safetensors: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 99.6M/99.6M [00:00<00:00, 114MB/s] model-00003-of-00003.safetensors: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 99.6M/99.6M [00:00<00:00, 101MB/s]
	Loading checkpoint shards: 0%\| \| 0/3 [00:00<?, ?it/s] Loading checkpoint shards: 67%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 2/3 [00:00<00:00, 20.00it/s] Loading checkpoint shards: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 3/3 [00:00<00:00, 29.22it/s]
	generation_config.json: 0%\| \| 0.00/238 [00:00<?, ?B/s] generation_config.json: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 238/238 [00:00<00:00, 938kB/s]
	[2026-01-03 15:17:57,914] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:284] Converting modules to torch.bfloat16
	[2026-01-03 15:17:59,169] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:284] Memory usage after model load 0.000GB ()
	[2026-01-03 15:18:21,194] [INFO] [axolotl.train.save_initial_configs:417] [PID:284] Pre-saving tokenizer to ./tieto-code-mini-4b-instruct...
	[2026-01-03 15:18:21,607] [INFO] [axolotl.train.save_initial_configs:422] [PID:284] Pre-saving model config to ./tieto-code-mini-4b-instruct...
	[2026-01-03 15:18:21,614] [INFO] [axolotl.train.execute_training:212] [PID:284] Starting trainer...
	[2026-01-03 15:18:24,659] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.132145643234253
	[2026-01-03 15:18:25,818] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1583445072174072
	[2026-01-03 15:18:26,866] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0472462177276611
	[2026-01-03 15:18:27,906] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0393366813659668
	[2026-01-03 15:18:27,906] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [10]
	0%\| \| 0/6 [00:00<?, ?it/s][2026-01-03 15:18:28,025] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:18:30,107] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0382018089294434
	[2026-01-03 15:18:31,265] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1572446823120117
	[2026-01-03 15:18:32,337] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0719947814941406
	[2026-01-03 15:18:33,406] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.0679423809051514
	[2026-01-03 15:18:33,407] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 3.1398396492004395, 'eval_runtime': 2.9112, 'eval_samples_per_second': 8.931, 'eval_steps_per_second': 4.465, 'eval_ppl': 23.1002, 'memory/max_active (GiB)': 30.84, 'memory/max_allocated (GiB)': 30.84, 'memory/device_reserved (GiB)': 32.58, 'epoch': 0}
	0%\| \| 0/6 [00:08<?, ?it/s]
	100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 116.42it/s][A
	[A 17%\|███████████████████████████████▏ \| 1/6 [00:18<01:31, 18.36s/it] {'loss': 3.1865, 'grad_norm': 37.0, 'learning_rate': 0.0, 'ppl': 24.2036, 'memory/max_active (GiB)': 46.07, 'memory/max_allocated (GiB)': 46.07, 'memory/device_reserved (GiB)': 51.24, 'tokens_per_second_per_gpu': 8211.09, 'total_tokens': 54696, 'epoch': 0.4}
	17%\|███████████████████████████████▏ \| 1/6 [00:18<01:31, 18.36s/it][2026-01-03 15:18:46,390] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:18:48,760] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.101508617401123
	[2026-01-03 15:18:49,874] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1127257347106934
	[2026-01-03 15:18:50,996] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1217362880706787
	[2026-01-03 15:18:52,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1201398372650146
	[2026-01-03 15:18:52,117] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 3.1398396492004395, 'eval_runtime': 1.88, 'eval_samples_per_second': 13.83, 'eval_steps_per_second': 6.915, 'eval_ppl': 23.1002, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 51.24, 'epoch': 0.4}
	17%\|███████████████████████████████▏ \| 1/6 [00:25<01:31, 18.36s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 20.14it/s][A
	[A 33%\|██████████████████████████████████████████████████████████████▎ \| 2/6 [00:32<01:03, 15.79s/it] {'loss': 3.2114, 'grad_norm': 37.25, 'learning_rate': 2.0000000000000003e-06, 'ppl': 24.8138, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.51, 'tokens_per_second_per_gpu': 7540.09, 'total_tokens': 109425, 'epoch': 0.8}
	33%\|██████████████████████████████████████████████████████████████▎ \| 2/6 [00:32<01:03, 15.79s/it][2026-01-03 15:19:00,384] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:19:02,683] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1525053977966309
	[2026-01-03 15:19:03,815] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.131868600845337
	[2026-01-03 15:19:04,916] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.099958896636963
	[2026-01-03 15:19:06,081] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.1644210815429688
	[2026-01-03 15:19:06,081] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 3.0861029624938965, 'eval_runtime': 1.9265, 'eval_samples_per_second': 13.496, 'eval_steps_per_second': 6.748, 'eval_ppl': 21.8916, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.51, 'epoch': 0.8}
	33%\|██████████████████████████████████████████████████████████████▎ \| 2/6 [00:39<01:03, 15.79s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 20.09it/s][A
	[A[2026-01-03 15:19:08,022] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-2
	50%\|█████████████████████████████████████████████████████████████████████████████████████████████▌ \| 3/6 [01:44<02:04, 41.57s/it] {'loss': 3.1362, 'grad_norm': 35.5, 'learning_rate': 4.000000000000001e-06, 'ppl': 23.0162, 'memory/max_active (GiB)': 48.34, 'memory/max_allocated (GiB)': 48.34, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 5555.06, 'total_tokens': 130768, 'epoch': 1.0}
	50%\|█████████████████████████████████████████████████████████████████████████████████████████████▌ \| 3/6 [01:44<02:04, 41.57s/it][2026-01-03 15:20:12,627] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:20:15,379] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.314366102218628
	[2026-01-03 15:20:16,662] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2827715873718262
	[2026-01-03 15:20:17,967] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.304215431213379
	[2026-01-03 15:20:19,242] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2750258445739746
	[2026-01-03 15:20:19,243] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 2.908294916152954, 'eval_runtime': 2.149, 'eval_samples_per_second': 12.099, 'eval_steps_per_second': 6.049, 'eval_ppl': 18.3255, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.0}
	50%\|█████████████████████████████████████████████████████████████████████████████████████████████▌ \| 3/6 [01:53<02:04, 41.57s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 20.02it/s][A
	[A 67%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 4/6 [02:03<01:05, 32.65s/it] {'loss': 2.9691, 'grad_norm': 31.125, 'learning_rate': 6e-06, 'ppl': 19.4744, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 7579.19, 'total_tokens': 185778, 'epoch': 1.4}
	67%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 4/6 [02:03<01:05, 32.65s/it][2026-01-03 15:20:31,608] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:20:34,397] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3423559665679932
	[2026-01-03 15:20:35,906] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.5087320804595947
	[2026-01-03 15:20:37,385] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.478161096572876
	[2026-01-03 15:20:38,712] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3267621994018555
	[2026-01-03 15:20:38,712] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 2.72947359085083, 'eval_runtime': 2.1836, 'eval_samples_per_second': 11.907, 'eval_steps_per_second': 5.953, 'eval_ppl': 15.3248, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.4}
	67%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 4/6 [02:12<01:05, 32.65s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 20.22it/s][A
	[A[2026-01-03 15:20:40,912] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-4
	83%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 5/6 [03:19<00:48, 48.24s/it] {'loss': 2.7408, 'grad_norm': 22.125, 'learning_rate': 8.000000000000001e-06, 'ppl': 15.4994, 'memory/max_active (GiB)': 55.84, 'memory/max_allocated (GiB)': 55.84, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 7537.94, 'total_tokens': 240377, 'epoch': 1.8}
	83%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 5/6 [03:19<00:48, 48.24s/it][2026-01-03 15:21:47,499] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:21:50,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.5038812160491943
	[2026-01-03 15:21:51,792] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.288691759109497
	[2026-01-03 15:21:53,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2864303588867188
	[2026-01-03 15:21:54,394] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.3142154216766357
	[2026-01-03 15:21:54,394] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 2.5420169830322266, 'eval_runtime': 2.3069, 'eval_samples_per_second': 11.27, 'eval_steps_per_second': 5.635, 'eval_ppl': 12.7053, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 1.8}
	83%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 5/6 [03:28<00:48, 48.24s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 20.04it/s][A
	[A 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 6/6 [03:31<00:00, 35.95s/it] {'loss': 2.5747, 'grad_norm': 11.25, 'learning_rate': 1e-05, 'ppl': 13.1274, 'memory/max_active (GiB)': 48.34, 'memory/max_allocated (GiB)': 48.34, 'memory/device_reserved (GiB)': 60.52, 'tokens_per_second_per_gpu': 5517.82, 'total_tokens': 261536, 'epoch': 2.0}
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 6/6 [03:31<00:00, 35.95s/it][2026-01-03 15:21:59,590] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:284] Running evaluation step...
	[2026-01-03 15:22:02,502] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2924823760986328
	[2026-01-03 15:22:03,963] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.4604885578155518
	[2026-01-03 15:22:05,262] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2980804443359375
	[2026-01-03 15:22:06,512] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:284] generate_batches time: 1.2491240501403809
	[2026-01-03 15:22:06,512] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:284] gather_len_batches: [1]

	0%\| \| 0/1 [00:00<?, ?it/s][A
	[A{'eval_loss': 2.455676555633545, 'eval_runtime': 2.0695, 'eval_samples_per_second': 12.563, 'eval_steps_per_second': 6.282, 'eval_ppl': 11.6543, 'memory/max_active (GiB)': 40.65, 'memory/max_allocated (GiB)': 40.65, 'memory/device_reserved (GiB)': 60.52, 'epoch': 2.0}
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 6/6 [03:40<00:00, 35.95s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:00<00:00, 19.90it/s][A
	[A[2026-01-03 15:22:08,596] [INFO] [axolotl.core.trainers.base._save:692] [PID:284] Saving model checkpoint to ./tieto-code-mini-4b-instruct/checkpoint-6
	{'train_runtime': 279.4175, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.021, 'train_loss': 2.9697999954223633, 'memory/max_active (GiB)': 17.39, 'memory/max_allocated (GiB)': 17.39, 'memory/device_reserved (GiB)': 60.52, 'epoch': 2.0}
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 6/6 [04:39<00:00, 35.95s/it] 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 6/6 [04:39<00:00, 46.57s/it]
	[2026-01-03 15:23:07,487] [INFO] [axolotl.train.save_trained_model:233] [PID:284] Training completed! Saving trained model to ./tieto-code-mini-4b-instruct.
	[2026-01-03 15:23:37,492] [INFO] [axolotl.train.save_trained_model:351] [PID:284] Model successfully saved to ./tieto-code-mini-4b-instruct
	[0m