[2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB () [2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config: { "activation_offloading": false, "axolotl_config_path": "qwen3-sft-stmt-tk.yml", "base_model": "Qwen/Qwen3-8B", "base_model_config": "Qwen/Qwen3-8B", "batch_size": 16, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_90", "fp8": true, "n_gpu": 8, "n_node": 1 }, "chat_template": "qwen3", "chat_template_kwargs": { "enable_thinking": false }, "context_parallel_size": 1, "dataloader_num_workers": 8, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 192, "datasets": [ { "message_property_mappings": { "content": "content", "role": "role" }, "path": "xiaolesu/lean4-sft-stmt-tk", "split": "train", "trust_remote_code": false, "type": "alpaca" } ], "ddp": true, "device": "cuda:0", "device_map": { "": 0 }, "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.9.1" }, "eval_batch_size": 2, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_table_size": 0, "evals_per_epoch": 10, "experimental_skip_move_to_device": true, "flex_attention": true, "flex_attn_compile_kwargs": { "dynamic": false, "mode": "max-autotune-no-cudagraphs" }, "fp16": false, "fsdp": [ "full_shard", "auto_wrap" ], "fsdp_config": { "activation_checkpointing": true, "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "cpu_ram_efficient_loading": true, "fsdp_version": 2, "offload_params": false, "reshard_after_forward": true, "state_dict_type": "FULL_STATE_DICT", "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer" }, "fsdp_version": 2, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 1, "gradient_checkpointing": false, "include_tkps": true, "learning_rate": 1e-05, "liger_fused_linear_cross_entropy": true, "liger_glu_activation": true, "liger_layer_norm": true, "liger_rms_norm": true, "liger_rope": true, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": false, "local_rank": 0, "logging_steps": 5, "lora_dropout": 0.0, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, "micro_batch_size": 2, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_fused", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/qwen3-sft-stmt-tk/", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.liger.LigerPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.05, "save_total_limit": 3, "saves_per_epoch": 10, "sequence_len": 8192, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-8B", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "scale_rewards": true, "sync_ref_model": false, "use_vllm": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "use_wandb": true, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_name": "qwen3-8b-tk-run1", "wandb_project": "qwen3-sft-stmt-tk", "warmup_ratio": 0.1, "weight_decay": 0.0, "world_size": 8 } [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|> [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|> [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None [2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11 [2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets... [2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s] [2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None [2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205 [2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159 Dropping Invalid Sequences (8192) (num_proc=192): 0%| | 0/11192 [00:008192) (num_proc=192): 1%| | 59/11192 [00:02<06:34, 28.25 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 3%|▎ | 295/11192 [00:02<01:02, 175.65 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 6%|▌ | 649/11192 [00:02<00:23, 453.06 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 8%|▊ | 885/11192 [00:02<00:16, 634.46 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 10%|█ | 1121/11192 [00:02<00:11, 849.04 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 13%|█▎ | 1416/11192 [00:02<00:08, 1166.00 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 15%|█▌ | 1711/11192 [00:02<00:06, 1480.17 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 18%|█▊ | 2006/11192 [00:02<00:05, 1697.58 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 21%|██ | 2301/11192 [00:02<00:04, 1949.74 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 23%|██▎ | 2596/11192 [00:03<00:04, 2145.10 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 26%|██▌ | 2891/11192 [00:03<00:03, 2324.57 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 29%|██▉ | 3245/11192 [00:03<00:03, 2566.75 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 70%|██████▉ | 7828/11192 [00:03<00:00, 14035.00 examples/s] Dropping Invalid Sequences (8192) (num_proc=192): 100%|██████████| 11192/11192 [00:04<00:00, 2753.84 examples/s] [2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192]) Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/10830 [00:00 [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|> [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None [2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model [2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing... [2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True} Loading weights: 0%| | 0/399 [00:00