[2026-03-27 09:52:09,822] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:64102] bf16 support detected, enabling for this configuration.
[2026-03-27 09:52:09,985] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:64102] baseline 0.000GB ()
[2026-03-27 09:52:09,985] [INFO] [axolotl.cli.config.load_cfg:341] [PID:64102] config:
{
"activation_offloading": false,
"axolotl_config_path": "ministral3-3b-qlora.yaml",
"base_model": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
"base_model_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
"batch_size": 1,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_89",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 0,
"dataset_num_proc": 16,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "AlexHung29629/test_data_123",
"trust_remote_code": false,
"type": {
"field_instruction": "input",
"field_output": "output",
"field_system": "system",
"format": "{instruction}",
"no_input_format": "{instruction}",
"system_prompt": ""
}
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 1,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"evals_per_epoch": 1,
"experimental_skip_move_to_device": true,
"flex_attention": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": true,
"include_tkps": true,
"is_multimodal": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_dropout": 0.0,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "constant",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 1,
"model_config_type": "mistral3",
"model_config_type_text": "ministral3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_bnb_8bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/out",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"processor_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.5,
"saves_per_epoch": 1,
"scaling_softmax": true,
"sequence_len": 32768,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": false,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
"tokenizer_save_jinja_files": true,
"tokenizer_use_mistral_common": false,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"unfrozen_parameters": [
"^model.language_model.norm.weight$",
"^model.language_model.layers.2[0-5].[.a-z_]+$"
],
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.0,
"weight_decay": 0.0,
"world_size": 1
}
[2026-03-27 09:52:10,190] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:64102] Loaded image size: 1540 from model config
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 /
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 /
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 /
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 /
[2026-03-27 09:52:11,874] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:64102] Loading prepared dataset from disk at last_run_prepared/0a6d77d9f0fbd2dd6692eaf810500a77...
[2026-03-27 09:52:11,880] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:64102] total_num_tokens: 336_571
[2026-03-27 09:52:11,881] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:64102] `total_supervised_tokens: 3_275`
[2026-03-27 09:52:14,284] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9476313591003418
[2026-03-27 09:52:15,197] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9123454093933105
[2026-03-27 09:52:16,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9608397483825684
[2026-03-27 09:52:17,093] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9345409870147705
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:64102] data_loader_len: 12
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:64102] sample_packing_eff_est across ranks: [0.9337574351917614]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:64102] sample_packing_eff_est: 0.94
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:64102] total_num_steps: 24
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:64102] Maximum number of steps set at 24
[2026-03-27 09:52:17,148] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:64102] loading tokenizer... mistralai/Ministral-3-3B-Instruct-2512-BF16
[2026-03-27 09:52:19,018] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 /
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 /
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 /
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 /
[2026-03-27 09:52:23,670] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:64102] Loading model
[2026-03-27 09:52:23,794] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:64102] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-27 09:52:23,796] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:64102] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-27 09:52:23,797] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:402] [PID:64102] Applying multipack dataloader patch for sample packing...
[2026-03-27 09:52:23,820] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:64102] Applying Cut Cross Entropy to model type: mistral3
Loading weights: 0%| | 0/458 [00:00, ?it/s]
Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:00<00:00, 5938.85it/s]
[2026-03-27 09:52:25,202] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:64102] Converting modules to torch.bfloat16
[2026-03-27 09:52:25,776] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:64102] Memory usage after model load 0.000GB ()
[2026-03-27 09:52:26,681] [INFO] [axolotl.monkeypatch.scaled_softmax_attn.patch_scaled_softmax_attention:46] [PID:64102] Patched flex_attention with SSMax (s=0.43, b=0.0)
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.norm.weight
model.language_model.layers.20.self_attn.q_proj.weight
model.language_model.layers.20.self_attn.k_proj.weight
model.language_model.layers.20.self_attn.v_proj.weight
model.language_model.layers.20.self_attn.o_proj.weight
model.language_model.layers.20.mlp.gate_proj.weight
model.language_model.layers.20.mlp.up_proj.weight
model.language_model.layers.20.mlp.down_proj.weight
model.language_model.layers.20.input_layernorm.weight
model.language_model.layers.20.post_attention_layernorm.weight
model.language_model.layers.21.self_attn.q_proj.weight
model.language_model.layers.21.self_attn.k_proj.weight
model.language_model.layers.21.self_attn.v_proj.weight
model.language_model.layers.21.self_attn.o_proj.weight
model.language_model.layers.21.mlp.gate_proj.weight
model.language_model.layers.21.mlp.up_proj.weight
model.language_model.layers.21.mlp.down_proj.weight
model.language_model.layers.21.input_layernorm.weight
model.language_model.layers.21.post_attention_layernorm.weight
model.language_model.layers.22.self_attn.q_proj.weight
model.language_model.layers.22.self_attn.k_proj.weight
model.language_model.layers.22.self_attn.v_proj.weight
model.language_model.layers.22.self_attn.o_proj.weight
model.language_model.layers.22.mlp.gate_proj.weight
model.language_model.layers.22.mlp.up_proj.weight
model.language_model.layers.22.mlp.down_proj.weight
model.language_model.layers.22.input_layernorm.weight
model.language_model.layers.22.post_attention_layernorm.weight
model.language_model.layers.23.self_attn.q_proj.weight
model.language_model.layers.23.self_attn.k_proj.weight
model.language_model.layers.23.self_attn.v_proj.weight
model.language_model.layers.23.self_attn.o_proj.weight
model.language_model.layers.23.mlp.gate_proj.weight
model.language_model.layers.23.mlp.up_proj.weight
model.language_model.layers.23.mlp.down_proj.weight
model.language_model.layers.23.input_layernorm.weight
model.language_model.layers.23.post_attention_layernorm.weight
model.language_model.layers.24.self_attn.q_proj.weight
model.language_model.layers.24.self_attn.k_proj.weight
model.language_model.layers.24.self_attn.v_proj.weight
model.language_model.layers.24.self_attn.o_proj.weight
model.language_model.layers.24.mlp.gate_proj.weight
model.language_model.layers.24.mlp.up_proj.weight
model.language_model.layers.24.mlp.down_proj.weight
model.language_model.layers.24.input_layernorm.weight
model.language_model.layers.24.post_attention_layernorm.weight
model.language_model.layers.25.self_attn.q_proj.weight
model.language_model.layers.25.self_attn.k_proj.weight
model.language_model.layers.25.self_attn.v_proj.weight
model.language_model.layers.25.self_attn.o_proj.weight
model.language_model.layers.25.mlp.gate_proj.weight
model.language_model.layers.25.mlp.up_proj.weight
model.language_model.layers.25.mlp.down_proj.weight
model.language_model.layers.25.input_layernorm.weight
model.language_model.layers.25.post_attention_layernorm.weight
model.language_model.norm.weight
[2026-03-27 09:52:30,361] [INFO] [axolotl.train.save_initial_configs:421] [PID:64102] Pre-saving tokenizer to ./outputs/out...
[2026-03-27 09:52:30,482] [INFO] [axolotl.train.save_initial_configs:426] [PID:64102] Pre-saving model config to ./outputs/out...
[2026-03-27 09:52:30,484] [INFO] [axolotl.train.save_initial_configs:430] [PID:64102] Pre-saving processor to ./outputs/out...
[2026-03-27 09:52:31,014] [INFO] [axolotl.train.execute_training:222] [PID:64102] Starting trainer...
[2026-03-27 09:52:33,177] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.922748327255249
[2026-03-27 09:52:34,095] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9173660278320312
[2026-03-27 09:52:34,994] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.8986146450042725
[2026-03-27 09:52:35,943] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9490597248077393
[2026-03-27 09:52:35,943] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]
0%| | 0/24 [00:00, ?it/s][2026-03-27 09:52:39,230] [WARNING] [py.warnings._showwarnmsg:110] [PID:64102] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
_warn_once(
4%|███████▎ | 1/24 [00:18<06:55, 18.09s/it]
{'loss': '7.497', 'grad_norm': '235', 'learning_rate': '2e-05', 'ppl': '1802', 'memory/max_active (GiB)': '17.98', 'memory/max_allocated (GiB)': '17.98', 'memory/device_reserved (GiB)': '18.96', 'tokens/train_per_sec_per_gpu': '92.75', 'tokens/total': 32768, 'tokens/trainable': 1575, 'epoch': '0.08333'}
4%|███████▎ | 1/24 [00:18<06:55, 18.09s/it]
8%|██████████████▋ | 2/24 [00:29<05:12, 14.22s/it]
{'loss': '4.327', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '75.75', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.94', 'tokens/total': 65536, 'tokens/trainable': 1815, 'epoch': '0.1667'}
8%|██████████████▋ | 2/24 [00:29<05:12, 14.22s/it]
12%|██████████████████████ | 3/24 [00:44<05:01, 14.33s/it]
{'loss': '3.383', 'grad_norm': '276', 'learning_rate': '2e-05', 'ppl': '29.46', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.705', 'tokens/total': 98304, 'tokens/trainable': 1854, 'epoch': '0.25'}
12%|██████████████████████ | 3/24 [00:44<05:01, 14.33s/it]
17%|█████████████████████████████▎ | 4/24 [00:58<04:47, 14.39s/it]
{'loss': '4.797', 'grad_norm': '111', 'learning_rate': '2e-05', 'ppl': '121.1', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.89', 'tokens/total': 131072, 'tokens/trainable': 2141, 'epoch': '0.3333'}
17%|█████████████████████████████▎ | 4/24 [00:58<04:47, 14.39s/it]
21%|████████████████████████████████████▋ | 5/24 [01:06<03:52, 12.24s/it]
{'loss': '2.682', 'grad_norm': '54', 'learning_rate': '2e-05', 'ppl': '14.61', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '44.22', 'tokens/total': 163840, 'tokens/trainable': 2512, 'epoch': '0.4167'}
21%|████████████████████████████████████▋ | 5/24 [01:06<03:52, 12.24s/it]
25%|████████████████████████████████████████████ | 6/24 [01:21<03:54, 13.03s/it]
{'loss': '3.572', 'grad_norm': '90', 'learning_rate': '2e-05', 'ppl': '35.58', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.84', 'tokens/total': 196608, 'tokens/trainable': 2771, 'epoch': '0.5'}
25%|████████████████████████████████████████████ | 6/24 [01:21<03:54, 13.03s/it]
29%|███████████████████████████████████████████████████▎ | 7/24 [01:35<03:44, 13.18s/it]
{'loss': '1.781', 'grad_norm': '202', 'learning_rate': '2e-05', 'ppl': '5.938', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.935', 'tokens/total': 229376, 'tokens/trainable': 2797, 'epoch': '0.5833'}
29%|███████████████████████████████████████████████████▎ | 7/24 [01:35<03:44, 13.18s/it]
33%|██████████████████████████████████████████████████████████▋ | 8/24 [01:44<03:14, 12.15s/it]
{'loss': '3.93', 'grad_norm': '55.75', 'learning_rate': '2e-05', 'ppl': '50.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.41', 'tokens/total': 262144, 'tokens/trainable': 2999, 'epoch': '0.6667'}
33%|██████████████████████████████████████████████████████████▋ | 8/24 [01:44<03:14, 12.15s/it]
38%|██████████████████████████████████████████████████████████████████ | 9/24 [01:59<03:14, 12.99s/it]
{'loss': '3.408', 'grad_norm': '233', 'learning_rate': '2e-05', 'ppl': '30.21', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.624', 'tokens/total': 294912, 'tokens/trainable': 3023, 'epoch': '0.75'}
38%|██████████████████████████████████████████████████████████████████ | 9/24 [01:59<03:14, 12.99s/it]
42%|████████████████████████████████████████████████████████████████████████▉ | 10/24 [02:13<03:05, 13.25s/it]
{'loss': '1.711', 'grad_norm': '214', 'learning_rate': '2e-05', 'ppl': '5.535', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.03', 'tokens/total': 327680, 'tokens/trainable': 3051, 'epoch': '0.8333'}
42%|████████████████████████████████████████████████████████████████████████▉ | 10/24 [02:13<03:05, 13.25s/it]
46%|████████████████████████████████████████████████████████████████████████████████▏ | 11/24 [02:27<02:54, 13.39s/it]
{'loss': '1.723', 'grad_norm': '166', 'learning_rate': '2e-05', 'ppl': '5.604', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.637', 'tokens/total': 360448, 'tokens/trainable': 3087, 'epoch': '0.9167'}
46%|████████████████████████████████████████████████████████████████████████████████▏ | 11/24 [02:27<02:54, 13.39s/it]
50%|███████████████████████████████████████████████████████████████████████████████████████▌ | 12/24 [02:42<02:48, 14.00s/it]
{'loss': '4.694', 'grad_norm': '94.5', 'learning_rate': '2e-05', 'ppl': '109.3', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.2', 'tokens/total': 393216, 'tokens/trainable': 3275, 'epoch': '1'}
50%|███████████████████████████████████████████████████████████████████████████████████████▌ | 12/24 [02:42<02:48, 14.00s/it][2026-03-27 09:55:18,727] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-12
Writing model shards: 0%| | 0/1 [00:00, ?it/s][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.19s/it][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.19s/it]
54%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 13/24 [03:08<03:11, 17.45s/it]
{'loss': '1.37', 'grad_norm': '53.25', 'learning_rate': '2e-05', 'ppl': '3.936', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '10.1', 'tokens/total': 425984, 'tokens/trainable': 3411, 'epoch': '1.083'}
54%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 13/24 [03:08<03:11, 17.45s/it]
58%|██████████████████████████████████████████████████████████████████████████████████████████████████████ | 14/24 [03:16<02:27, 14.79s/it]
{'loss': '1.797', 'grad_norm': '22.75', 'learning_rate': '2e-05', 'ppl': '6.033', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '69.67', 'tokens/total': 458752, 'tokens/trainable': 4011, 'epoch': '1.167'}
58%|██████████████████████████████████████████████████████████████████████████████████████████████████████ | 14/24 [03:16<02:27, 14.79s/it]
62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 15/24 [03:30<02:10, 14.48s/it]
{'loss': '0.4977', 'grad_norm': '110.5', 'learning_rate': '2e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.045', 'tokens/total': 491520, 'tokens/trainable': 4039, 'epoch': '1.25'}
62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 15/24 [03:30<02:10, 14.48s/it]
67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 16/24 [03:45<01:57, 14.75s/it]
{'loss': '2.569', 'grad_norm': '52.5', 'learning_rate': '2e-05', 'ppl': '13.05', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.26', 'tokens/total': 524288, 'tokens/trainable': 4227, 'epoch': '1.333'}
67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 16/24 [03:45<01:57, 14.75s/it]
71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 17/24 [03:59<01:41, 14.48s/it]
{'loss': '0.1539', 'grad_norm': '23.75', 'learning_rate': '2e-05', 'ppl': '1.166', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '3.55', 'tokens/total': 557056, 'tokens/trainable': 4276, 'epoch': '1.417'}
71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 17/24 [03:59<01:41, 14.48s/it]
75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 18/24 [04:14<01:27, 14.54s/it]
{'loss': '1.552', 'grad_norm': '41.25', 'learning_rate': '2e-05', 'ppl': '4.722', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.7', 'tokens/total': 589824, 'tokens/trainable': 4535, 'epoch': '1.5'}
75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 18/24 [04:14<01:27, 14.54s/it]
79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 19/24 [04:29<01:13, 14.64s/it]
{'loss': '0.6448', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.906', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.618', 'tokens/total': 622592, 'tokens/trainable': 4559, 'epoch': '1.583'}
79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 19/24 [04:29<01:13, 14.64s/it]
83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 20/24 [04:44<00:58, 14.74s/it]
{'loss': '0.6573', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.93', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.617', 'tokens/total': 655360, 'tokens/trainable': 4598, 'epoch': '1.667'}
83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 20/24 [04:44<00:58, 14.74s/it]
88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 21/24 [04:56<00:41, 13.93s/it]
{'loss': '1.997', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '7.369', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '8.91', 'tokens/total': 688128, 'tokens/trainable': 4705, 'epoch': '1.75'}
88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 21/24 [04:56<00:41, 13.93s/it]
92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 22/24 [05:11<00:28, 14.21s/it]
{'loss': '2.558', 'grad_norm': '59.75', 'learning_rate': '2e-05', 'ppl': '12.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.36', 'tokens/total': 720896, 'tokens/trainable': 4992, 'epoch': '1.833'}
92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 22/24 [05:11<00:28, 14.21s/it]
96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 23/24 [05:22<00:13, 13.41s/it]
{'loss': '2.735', 'grad_norm': '60.5', 'learning_rate': '2e-05', 'ppl': '15.41', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '7.224', 'tokens/total': 753664, 'tokens/trainable': 5075, 'epoch': '1.917'}
96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 23/24 [05:22<00:13, 13.41s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:37<00:00, 13.76s/it]
{'loss': '4.172', 'grad_norm': '46.5', 'learning_rate': '2e-05', 'ppl': '64.87', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '101.3', 'tokens/total': 786432, 'tokens/trainable': 6550, 'epoch': '2'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:37<00:00, 13.76s/it][2026-03-27 09:58:13,284] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-24
Writing model shards: 0%| | 0/1 [00:00, ?it/s][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.19s/it][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.19s/it]
{'train_runtime': '347.4', 'train_samples_per_second': '0.069', 'train_steps_per_second': '0.069', 'train_loss': '2.675', 'memory/max_active (GiB)': '8.52', 'memory/max_allocated (GiB)': '8.52', 'memory/device_reserved (GiB)': '20.65', 'epoch': '2', 'tokens/train_per_sec_per_gpu': '0'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:47<00:00, 13.76s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:47<00:00, 14.47s/it]
[2026-03-27 09:58:23,361] [INFO] [axolotl.train.save_trained_model:241] [PID:64102] Training completed! Saving trained model to ./outputs/out.
Writing model shards: 0%| | 0/1 [00:00, ?it/s]
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.04s/it]
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.04s/it]
[2026-03-27 09:58:32,450] [INFO] [axolotl.train.save_trained_model:355] [PID:64102] Model successfully saved to ./outputs/out