[2026-01-03 15:17:19,855] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:284] bf16 support detected, enabling for this configuration.
config.json: 0%| | 0.00/727
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <|endoftext|>
[2026-01-03 15:17:22,710] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None
[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:284] Unable to find prepared dataset in last_run_prepared/90a4bd078072b9d1de83a8db5d6b8671
[2026-01-03 15:17:22,713] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:284] Loading raw datasets...
[2026-01-03 15:17:22,714] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:284] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Generating train split: 503 examples [00:00, 22482.50 examples/s]
[2026-01-03 15:17:23,108] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:284] Loading dataset: data.jsonl with base_type: chat_template and prompt_style: None
[2026-01-03 15:17:23,136] [INFO] [axolotl.prompt_strategies.chat_template.__call__:996] [PID:284] Using chat template:
---
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
---
Tokenizing Prompts (num_proc=24): 0%| | 0/503
Dropping Long Sequences (>8192) (num_proc=24): 100%|████████████| 503/503 [00:01<00:00, 414.08 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=24): 0%| | 0/503
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:284] BOS: None / None
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:284] PAD: 151643 / <|endoftext|>
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:284] UNK: None / None
[2026-01-03 15:17:42,821] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:284] Loading model
[2026-01-03 15:17:42,956] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:284] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-03 15:17:42,961] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:284] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-03 15:17:42,961] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:284] Applying multipack dataloader patch for sample packing...
model.safetensors.index.json: 32.8kB [00:00, 47.6MB/s]
model-00001-of-00003.safetensors: 0%| | 0.00/3.96G
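The chat template logged above is the standard ChatML layout: each message is wrapped in `<|im_start|>{role}` / `<|im_end|>` markers, with an optional trailing assistant header when a generation prompt is requested. A minimal pure-Python sketch of what that template renders (the newline placement is an assumption based on the usual ChatML format, since the log collapsed whitespace):

```python
def apply_chat_template(messages, add_generation_prompt=False):
    """Pure-Python equivalent of the ChatML-style Jinja template in the log.

    Illustrative sketch only; the real rendering is done by the tokenizer's
    Jinja chat template.
    """
    out = ""
    for message in messages:
        # Each turn: <|im_start|>{role}\n{content}<|im_end|>\n
        out += "<|im_start|>" + message["role"] + "\n" + message["content"] + "<|im_end|>" + "\n"
    if add_generation_prompt:
        # Mirrors the template's trailing '<|im_start|>assistant\n'
        out += "<|im_start|>assistant\n"
    return out

prompt = apply_chat_template(
    [{"role": "user", "content": "Hello"},
     {"role": "assistant", "content": "Hi there"}],
)
print(prompt)
```

During training `add_generation_prompt` stays false (as the template's default branch sets it); at inference time it is enabled so the model continues from the open assistant header.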
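The two filtering passes in the log ("Dropping Long Sequences (>8192)" and "Drop Samples with Zero Trainable Tokens") can be sketched as a simple row filter. The `input_ids`/`labels` dict layout mirrors a tokenized Hugging Face dataset row; this is an illustrative sketch under those assumptions, not axolotl's implementation:

```python
IGNORE_INDEX = -100  # Hugging Face convention: labels of -100 are excluded from the loss

def keep_sample(row, max_len=8192):
    """Return True if a tokenized row survives both filtering passes."""
    if len(row["input_ids"]) > max_len:
        return False  # "Dropping Long Sequences (>8192)"
    if all(label == IGNORE_INDEX for label in row["labels"]):
        return False  # "Drop Samples with Zero Trainable Tokens"
    return True

rows = [
    {"input_ids": [1] * 100,  "labels": [IGNORE_INDEX] * 50 + [1] * 50},  # kept
    {"input_ids": [1] * 9000, "labels": [1] * 9000},                      # too long
    {"input_ids": [1] * 10,   "labels": [IGNORE_INDEX] * 10},             # fully masked
]
kept = [r for r in rows if keep_sample(r)]
print(len(kept))  # 1
```

A sample whose labels are all `-100` (e.g. a conversation where only masked prompt tokens fit in the window) contributes nothing to the gradient, which is why it is dropped rather than trained on.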
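The "multipack dataloader patch for sample packing" refers to packing several short sequences into one fixed-length batch slot so padding tokens are not wasted. A minimal first-fit-decreasing sketch of the idea; axolotl's actual multipack implementation differs in detail (it packs by token count across distributed ranks):

```python
def pack_samples(lengths, max_len=8192):
    """Greedily group sequence lengths into bins of capacity max_len.

    Returns a list of bins, each a list of sample indices whose lengths
    sum to at most max_len. Illustrative sketch of sample packing only.
    """
    bins = []  # each bin: [remaining_capacity, [sample indices]]
    # First-fit-decreasing: place longest sequences first
    order = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)
    for i in order:
        for b in bins:
            if lengths[i] <= b[0]:
                b[0] -= lengths[i]
                b[1].append(i)
                break
        else:
            bins.append([max_len - lengths[i], [i]])
    return [b[1] for b in bins]

print(pack_samples([5000, 4000, 3000, 1000], max_len=8192))  # [[0, 2], [1, 3]]
```

Here four sequences that would otherwise need four padded 8192-token slots fit into two, roughly doubling effective throughput for this toy example.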
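The two "nanmean loss calculation" patches guard the reported loss: averaging with a plain mean lets a single NaN batch loss poison the whole eval/train metric, while a NaN-ignoring mean keeps the remaining batches' signal. A stdlib sketch of that idea (not axolotl's patched Trainer code):

```python
import math

def nanmean(losses):
    """Mean over non-NaN loss values; NaN only if every value is NaN."""
    vals = [x for x in losses if not math.isnan(x)]
    return sum(vals) / len(vals) if vals else float("nan")

# A single NaN batch no longer drags the reported loss to NaN
print(nanmean([0.5, float("nan"), 1.5]))  # 1.0
```

A plain `sum(losses) / len(losses)` on the same input would return NaN, which is exactly the failure mode the patches in the log work around.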