[2026-03-27 09:52:09,822] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:64102] bf16 support detected, enabling for this configuration.
[2026-03-27 09:52:09,985] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:64102] baseline 0.000GB ()
[2026-03-27 09:52:09,985] [INFO] [axolotl.cli.config.load_cfg:341] [PID:64102] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "ministral3-3b-qlora.yaml",
  "base_model": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "base_model_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "batch_size": 1,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_89",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 0,
  "dataset_num_proc": 16,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "AlexHung29629/test_data_123",
      "trust_remote_code": false,
      "type": {
        "field_instruction": "input",
        "field_output": "output",
        "field_system": "system",
        "format": "{instruction}",
        "no_input_format": "{instruction}",
        "system_prompt": ""
      }
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flex_attention": true,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "include_tkps": true,
  "is_multimodal": true,
  "layer_offloading": false,
  "learning_rate": 2e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 1,
  "model_config_type": "mistral3",
  "model_config_type_text": "ministral3",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_bnb_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/out",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.5,
  "saves_per_epoch": 1,
  "scaling_softmax": true,
  "sequence_len": 32768,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "mistralai/Ministral-3-3B-Instruct-2512-BF16",
  "tokenizer_save_jinja_files": true,
  "tokenizer_use_mistral_common": false,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "unfrozen_parameters": [
    "^model.language_model.norm.weight$",
    "^model.language_model.layers.2[0-5].[.a-z_]+$"
  ],
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.0,
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-03-27 09:52:10,190] [DEBUG] [axolotl.loaders.utils.check_model_config:88] [PID:64102] Loaded image size: 1540 from model config
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 / </s>
[2026-03-27 09:52:11,872] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 / <s>
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 / <pad>
[2026-03-27 09:52:11,873] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 / <unk>
[2026-03-27 09:52:11,874] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:64102] Loading prepared dataset from disk at last_run_prepared/0a6d77d9f0fbd2dd6692eaf810500a77...
[2026-03-27 09:52:11,880] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:64102] total_num_tokens: 336_571
[2026-03-27 09:52:11,881] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:64102] `total_supervised_tokens: 3_275`
[2026-03-27 09:52:14,284] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9476313591003418
[2026-03-27 09:52:15,197] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9123454093933105
[2026-03-27 09:52:16,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9608397483825684
[2026-03-27 09:52:17,093] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9345409870147705
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:64102] data_loader_len: 12
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:64102] sample_packing_eff_est across ranks: [0.9337574351917614]
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:64102] sample_packing_eff_est: 0.94
[2026-03-27 09:52:17,119] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:64102] total_num_steps: 24
[2026-03-27 09:52:17,119] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:64102] Maximum number of steps set at 24
[2026-03-27 09:52:17,148] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:64102] loading tokenizer... mistralai/Ministral-3-3B-Instruct-2512-BF16
[2026-03-27 09:52:19,018] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:64102] EOS: 2 / </s>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:64102] BOS: 1 / <s>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:64102] PAD: 11 / <pad>
[2026-03-27 09:52:19,019] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:64102] UNK: 0 / <unk>
[2026-03-27 09:52:23,670] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:64102] Loading model
[2026-03-27 09:52:23,794] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:64102] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-27 09:52:23,796] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:64102] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-27 09:52:23,797] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:402] [PID:64102] Applying multipack dataloader patch for sample packing...
[2026-03-27 09:52:23,820] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:64102] Applying Cut Cross Entropy to model type: mistral3
Loading weights:   0%|                                                                                                                                                                      | 0/458 [00:00<?, ?it/s]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:00<00:00, 5938.85it/s]
[2026-03-27 09:52:25,202] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:64102] Converting modules to torch.bfloat16
[2026-03-27 09:52:25,776] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:64102] Memory usage after model load 0.000GB ()
[2026-03-27 09:52:26,681] [INFO] [axolotl.monkeypatch.scaled_softmax_attn.patch_scaled_softmax_attention:46] [PID:64102] Patched flex_attention with SSMax (s=0.43, b=0.0)
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.20.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.21.post_attention_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.q_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.k_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.v_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.self_attn.o_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.gate_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.up_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.mlp.down_proj.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.input_layernorm.weight
[2026-03-27 09:52:26,685] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.22.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.23.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.24.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.q_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.k_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.v_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.self_attn.o_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.gate_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.up_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.mlp.down_proj.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.input_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.layers.25.post_attention_layernorm.weight
[2026-03-27 09:52:26,686] [DEBUG] [axolotl.utils.freeze.freeze_layers_except:56] [PID:64102] Unfrozen model.language_model.norm.weight
model.language_model.layers.20.self_attn.q_proj.weight
model.language_model.layers.20.self_attn.k_proj.weight
model.language_model.layers.20.self_attn.v_proj.weight
model.language_model.layers.20.self_attn.o_proj.weight
model.language_model.layers.20.mlp.gate_proj.weight
model.language_model.layers.20.mlp.up_proj.weight
model.language_model.layers.20.mlp.down_proj.weight
model.language_model.layers.20.input_layernorm.weight
model.language_model.layers.20.post_attention_layernorm.weight
model.language_model.layers.21.self_attn.q_proj.weight
model.language_model.layers.21.self_attn.k_proj.weight
model.language_model.layers.21.self_attn.v_proj.weight
model.language_model.layers.21.self_attn.o_proj.weight
model.language_model.layers.21.mlp.gate_proj.weight
model.language_model.layers.21.mlp.up_proj.weight
model.language_model.layers.21.mlp.down_proj.weight
model.language_model.layers.21.input_layernorm.weight
model.language_model.layers.21.post_attention_layernorm.weight
model.language_model.layers.22.self_attn.q_proj.weight
model.language_model.layers.22.self_attn.k_proj.weight
model.language_model.layers.22.self_attn.v_proj.weight
model.language_model.layers.22.self_attn.o_proj.weight
model.language_model.layers.22.mlp.gate_proj.weight
model.language_model.layers.22.mlp.up_proj.weight
model.language_model.layers.22.mlp.down_proj.weight
model.language_model.layers.22.input_layernorm.weight
model.language_model.layers.22.post_attention_layernorm.weight
model.language_model.layers.23.self_attn.q_proj.weight
model.language_model.layers.23.self_attn.k_proj.weight
model.language_model.layers.23.self_attn.v_proj.weight
model.language_model.layers.23.self_attn.o_proj.weight
model.language_model.layers.23.mlp.gate_proj.weight
model.language_model.layers.23.mlp.up_proj.weight
model.language_model.layers.23.mlp.down_proj.weight
model.language_model.layers.23.input_layernorm.weight
model.language_model.layers.23.post_attention_layernorm.weight
model.language_model.layers.24.self_attn.q_proj.weight
model.language_model.layers.24.self_attn.k_proj.weight
model.language_model.layers.24.self_attn.v_proj.weight
model.language_model.layers.24.self_attn.o_proj.weight
model.language_model.layers.24.mlp.gate_proj.weight
model.language_model.layers.24.mlp.up_proj.weight
model.language_model.layers.24.mlp.down_proj.weight
model.language_model.layers.24.input_layernorm.weight
model.language_model.layers.24.post_attention_layernorm.weight
model.language_model.layers.25.self_attn.q_proj.weight
model.language_model.layers.25.self_attn.k_proj.weight
model.language_model.layers.25.self_attn.v_proj.weight
model.language_model.layers.25.self_attn.o_proj.weight
model.language_model.layers.25.mlp.gate_proj.weight
model.language_model.layers.25.mlp.up_proj.weight
model.language_model.layers.25.mlp.down_proj.weight
model.language_model.layers.25.input_layernorm.weight
model.language_model.layers.25.post_attention_layernorm.weight
model.language_model.norm.weight
[2026-03-27 09:52:30,361] [INFO] [axolotl.train.save_initial_configs:421] [PID:64102] Pre-saving tokenizer to ./outputs/out...
[2026-03-27 09:52:30,482] [INFO] [axolotl.train.save_initial_configs:426] [PID:64102] Pre-saving model config to ./outputs/out...
[2026-03-27 09:52:30,484] [INFO] [axolotl.train.save_initial_configs:430] [PID:64102] Pre-saving processor to ./outputs/out...
[2026-03-27 09:52:31,014] [INFO] [axolotl.train.execute_training:222] [PID:64102] Starting trainer...
[2026-03-27 09:52:33,177] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.922748327255249
[2026-03-27 09:52:34,095] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9173660278320312
[2026-03-27 09:52:34,994] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.8986146450042725
[2026-03-27 09:52:35,943] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:64102] generate_batches time: 0.9490597248077393
[2026-03-27 09:52:35,943] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:64102] gather_len_batches: [12]
  0%|                                                                                                                                                                                        | 0/24 [00:00<?, ?it/s][2026-03-27 09:52:39,230] [WARNING] [py.warnings._showwarnmsg:110] [PID:64102] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.
  _warn_once(

  4%|███████▎                                                                                                                                                                        | 1/24 [00:18<06:55, 18.09s/it]                                                                                                                                                                                                                    {'loss': '7.497', 'grad_norm': '235', 'learning_rate': '2e-05', 'ppl': '1802', 'memory/max_active (GiB)': '17.98', 'memory/max_allocated (GiB)': '17.98', 'memory/device_reserved (GiB)': '18.96', 'tokens/train_per_sec_per_gpu': '92.75', 'tokens/total': 32768, 'tokens/trainable': 1575, 'epoch': '0.08333'}
  4%|███████▎                                                                                                                                                                        | 1/24 [00:18<06:55, 18.09s/it]  8%|██████████████▋                                                                                                                                                                 | 2/24 [00:29<05:12, 14.22s/it]                                                                                                                                                                                                                    {'loss': '4.327', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '75.75', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.94', 'tokens/total': 65536, 'tokens/trainable': 1815, 'epoch': '0.1667'}
  8%|██████████████▋                                                                                                                                                                 | 2/24 [00:29<05:12, 14.22s/it] 12%|██████████████████████                                                                                                                                                          | 3/24 [00:44<05:01, 14.33s/it]                                                                                                                                                                                                                    {'loss': '3.383', 'grad_norm': '276', 'learning_rate': '2e-05', 'ppl': '29.46', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.705', 'tokens/total': 98304, 'tokens/trainable': 1854, 'epoch': '0.25'}
 12%|██████████████████████                                                                                                                                                          | 3/24 [00:44<05:01, 14.33s/it] 17%|█████████████████████████████▎                                                                                                                                                  | 4/24 [00:58<04:47, 14.39s/it]                                                                                                                                                                                                                    {'loss': '4.797', 'grad_norm': '111', 'learning_rate': '2e-05', 'ppl': '121.1', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.89', 'tokens/total': 131072, 'tokens/trainable': 2141, 'epoch': '0.3333'}
 17%|█████████████████████████████▎                                                                                                                                                  | 4/24 [00:58<04:47, 14.39s/it] 21%|████████████████████████████████████▋                                                                                                                                           | 5/24 [01:06<03:52, 12.24s/it]                                                                                                                                                                                                                    {'loss': '2.682', 'grad_norm': '54', 'learning_rate': '2e-05', 'ppl': '14.61', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '44.22', 'tokens/total': 163840, 'tokens/trainable': 2512, 'epoch': '0.4167'}
 21%|████████████████████████████████████▋                                                                                                                                           | 5/24 [01:06<03:52, 12.24s/it] 25%|████████████████████████████████████████████                                                                                                                                    | 6/24 [01:21<03:54, 13.03s/it]                                                                                                                                                                                                                    {'loss': '3.572', 'grad_norm': '90', 'learning_rate': '2e-05', 'ppl': '35.58', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.84', 'tokens/total': 196608, 'tokens/trainable': 2771, 'epoch': '0.5'}
 25%|████████████████████████████████████████████                                                                                                                                    | 6/24 [01:21<03:54, 13.03s/it] 29%|███████████████████████████████████████████████████▎                                                                                                                            | 7/24 [01:35<03:44, 13.18s/it]                                                                                                                                                                                                                    {'loss': '1.781', 'grad_norm': '202', 'learning_rate': '2e-05', 'ppl': '5.938', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.935', 'tokens/total': 229376, 'tokens/trainable': 2797, 'epoch': '0.5833'}
 29%|███████████████████████████████████████████████████▎                                                                                                                            | 7/24 [01:35<03:44, 13.18s/it] 33%|██████████████████████████████████████████████████████████▋                                                                                                                     | 8/24 [01:44<03:14, 12.15s/it]                                                                                                                                                                                                                    {'loss': '3.93', 'grad_norm': '55.75', 'learning_rate': '2e-05', 'ppl': '50.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '20.41', 'tokens/total': 262144, 'tokens/trainable': 2999, 'epoch': '0.6667'}
 33%|██████████████████████████████████████████████████████████▋                                                                                                                     | 8/24 [01:44<03:14, 12.15s/it] 38%|██████████████████████████████████████████████████████████████████                                                                                                              | 9/24 [01:59<03:14, 12.99s/it]                                                                                                                                                                                                                    {'loss': '3.408', 'grad_norm': '233', 'learning_rate': '2e-05', 'ppl': '30.21', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.624', 'tokens/total': 294912, 'tokens/trainable': 3023, 'epoch': '0.75'}
 38%|██████████████████████████████████████████████████████████████████                                                                                                              | 9/24 [01:59<03:14, 12.99s/it] 42%|████████████████████████████████████████████████████████████████████████▉                                                                                                      | 10/24 [02:13<03:05, 13.25s/it]                                                                                                                                                                                                                    {'loss': '1.711', 'grad_norm': '214', 'learning_rate': '2e-05', 'ppl': '5.535', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.03', 'tokens/total': 327680, 'tokens/trainable': 3051, 'epoch': '0.8333'}
 42%|████████████████████████████████████████████████████████████████████████▉                                                                                                      | 10/24 [02:13<03:05, 13.25s/it] 46%|████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 11/24 [02:27<02:54, 13.39s/it]                                                                                                                                                                                                                    {'loss': '1.723', 'grad_norm': '166', 'learning_rate': '2e-05', 'ppl': '5.604', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.637', 'tokens/total': 360448, 'tokens/trainable': 3087, 'epoch': '0.9167'}
 46%|████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 11/24 [02:27<02:54, 13.39s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 12/24 [02:42<02:48, 14.00s/it]                                                                                                                                                                                                                    {'loss': '4.694', 'grad_norm': '94.5', 'learning_rate': '2e-05', 'ppl': '109.3', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.2', 'tokens/total': 393216, 'tokens/trainable': 3275, 'epoch': '1'}
 50%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 12/24 [02:42<02:48, 14.00s/it][2026-03-27 09:55:18,727] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-12

Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.19s/it][AWriting model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.19s/it]
 54%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 13/24 [03:08<03:11, 17.45s/it]                                                                                                                                                                                                                    {'loss': '1.37', 'grad_norm': '53.25', 'learning_rate': '2e-05', 'ppl': '3.936', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '10.1', 'tokens/total': 425984, 'tokens/trainable': 3411, 'epoch': '1.083'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 13/24 [03:08<03:11, 17.45s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 14/24 [03:16<02:27, 14.79s/it]                                                                                                                                                                                                                    {'loss': '1.797', 'grad_norm': '22.75', 'learning_rate': '2e-05', 'ppl': '6.033', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '69.67', 'tokens/total': 458752, 'tokens/trainable': 4011, 'epoch': '1.167'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 14/24 [03:16<02:27, 14.79s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 15/24 [03:30<02:10, 14.48s/it]                                                                                                                                                                                                                    {'loss': '0.4977', 'grad_norm': '110.5', 'learning_rate': '2e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.045', 'tokens/total': 491520, 'tokens/trainable': 4039, 'epoch': '1.25'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 15/24 [03:30<02:10, 14.48s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 16/24 [03:45<01:57, 14.75s/it]                                                                                                                                                                                                                    {'loss': '2.569', 'grad_norm': '52.5', 'learning_rate': '2e-05', 'ppl': '13.05', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '12.26', 'tokens/total': 524288, 'tokens/trainable': 4227, 'epoch': '1.333'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 16/24 [03:45<01:57, 14.75s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 17/24 [03:59<01:41, 14.48s/it]                                                                                                                                                                                                                    {'loss': '0.1539', 'grad_norm': '23.75', 'learning_rate': '2e-05', 'ppl': '1.166', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '3.55', 'tokens/total': 557056, 'tokens/trainable': 4276, 'epoch': '1.417'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 17/24 [03:59<01:41, 14.48s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 18/24 [04:14<01:27, 14.54s/it]                                                                                                                                                                                                                    {'loss': '1.552', 'grad_norm': '41.25', 'learning_rate': '2e-05', 'ppl': '4.722', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '17.7', 'tokens/total': 589824, 'tokens/trainable': 4535, 'epoch': '1.5'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 18/24 [04:14<01:27, 14.54s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 19/24 [04:29<01:13, 14.64s/it]                                                                                                                                                                                                                    {'loss': '0.6448', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.906', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '1.618', 'tokens/total': 622592, 'tokens/trainable': 4559, 'epoch': '1.583'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 19/24 [04:29<01:13, 14.64s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 20/24 [04:44<00:58, 14.74s/it]                                                                                                                                                                                                                    {'loss': '0.6573', 'grad_norm': '96', 'learning_rate': '2e-05', 'ppl': '1.93', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '2.617', 'tokens/total': 655360, 'tokens/trainable': 4598, 'epoch': '1.667'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 20/24 [04:44<00:58, 14.74s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 21/24 [04:56<00:41, 13.93s/it]                                                                                                                                                                                                                    {'loss': '1.997', 'grad_norm': '84.5', 'learning_rate': '2e-05', 'ppl': '7.369', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '8.91', 'tokens/total': 688128, 'tokens/trainable': 4705, 'epoch': '1.75'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 21/24 [04:56<00:41, 13.93s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 22/24 [05:11<00:28, 14.21s/it]                                                                                                                                                                                                                    {'loss': '2.558', 'grad_norm': '59.75', 'learning_rate': '2e-05', 'ppl': '12.91', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '19.36', 'tokens/total': 720896, 'tokens/trainable': 4992, 'epoch': '1.833'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 22/24 [05:11<00:28, 14.21s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 23/24 [05:22<00:13, 13.41s/it]                                                                                                                                                                                                                    {'loss': '2.735', 'grad_norm': '60.5', 'learning_rate': '2e-05', 'ppl': '15.41', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '7.224', 'tokens/total': 753664, 'tokens/trainable': 5075, 'epoch': '1.917'}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 23/24 [05:22<00:13, 13.41s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:37<00:00, 13.76s/it]                                                                                                                                                                                                                    {'loss': '4.172', 'grad_norm': '46.5', 'learning_rate': '2e-05', 'ppl': '64.87', 'memory/max_active (GiB)': '19.31', 'memory/max_allocated (GiB)': '19.31', 'memory/device_reserved (GiB)': '20.65', 'tokens/train_per_sec_per_gpu': '101.3', 'tokens/total': 786432, 'tokens/trainable': 6550, 'epoch': '2'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:37<00:00, 13.76s/it][2026-03-27 09:58:13,284] [INFO] [axolotl.core.trainers.base._save:722] [PID:64102] Saving model checkpoint to ./outputs/out/checkpoint-24

Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.19s/it][AWriting model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.19s/it]
                                                                                                                                                                                                                    {'train_runtime': '347.4', 'train_samples_per_second': '0.069', 'train_steps_per_second': '0.069', 'train_loss': '2.675', 'memory/max_active (GiB)': '8.52', 'memory/max_allocated (GiB)': '8.52', 'memory/device_reserved (GiB)': '20.65', 'epoch': '2', 'tokens/train_per_sec_per_gpu': '0'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:47<00:00, 13.76s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [05:47<00:00, 14.47s/it]
[2026-03-27 09:58:23,361] [INFO] [axolotl.train.save_trained_model:241] [PID:64102] Training completed! Saving trained model to ./outputs/out.
Writing model shards:   0%|                                                                                                                                                                   | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.04s/it]Writing model shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.04s/it]
[2026-03-27 09:58:32,450] [INFO] [axolotl.train.save_trained_model:355] [PID:64102] Model successfully saved to ./outputs/out