| [2026-03-16 19:06:45,455] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:213] baseline 0.000GB () |
| [2026-03-16 19:06:45,456] [INFO] [axolotl.cli.config.load_cfg:340] [PID:213] config: |
| { |
| "activation_offloading": false, |
| "axolotl_config_path": "qwen3-sft-stmt-tk.yml", |
| "base_model": "Qwen/Qwen3-8B", |
| "base_model_config": "Qwen/Qwen3-8B", |
| "batch_size": 16, |
| "bf16": true, |
| "capabilities": { |
| "bf16": true, |
| "compute_capability": "sm_90", |
| "fp8": true, |
| "n_gpu": 8, |
| "n_node": 1 |
| }, |
| "chat_template": "qwen3", |
| "chat_template_kwargs": { |
| "enable_thinking": false |
| }, |
| "context_parallel_size": 1, |
| "dataloader_num_workers": 8, |
| "dataloader_pin_memory": true, |
| "dataloader_prefetch_factor": 256, |
| "dataset_num_proc": 192, |
| "datasets": [ |
| { |
| "message_property_mappings": { |
| "content": "content", |
| "role": "role" |
| }, |
| "path": "xiaolesu/lean4-sft-stmt-tk", |
| "split": "train", |
| "trust_remote_code": false, |
| "type": "alpaca" |
| } |
| ], |
| "ddp": true, |
| "device": "cuda:0", |
| "device_map": { |
| "": 0 |
| }, |
| "dion_rank_fraction": 1.0, |
| "dion_rank_multiple_of": 1, |
| "eaft_alpha": 1.0, |
| "eaft_k": 20, |
| "env_capabilities": { |
| "torch_version": "2.9.1" |
| }, |
| "eval_batch_size": 2, |
| "eval_causal_lm_metrics": [ |
| "sacrebleu", |
| "comet", |
| "ter", |
| "chrf" |
| ], |
| "eval_max_new_tokens": 128, |
| "eval_sample_packing": true, |
| "eval_table_size": 0, |
| "evals_per_epoch": 10, |
| "experimental_skip_move_to_device": true, |
| "flex_attention": true, |
| "flex_attn_compile_kwargs": { |
| "dynamic": false, |
| "mode": "max-autotune-no-cudagraphs" |
| }, |
| "fp16": false, |
| "fsdp": [ |
| "full_shard", |
| "auto_wrap" |
| ], |
| "fsdp_config": { |
| "activation_checkpointing": true, |
| "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", |
| "cpu_ram_efficient_loading": true, |
| "fsdp_version": 2, |
| "offload_params": false, |
| "reshard_after_forward": true, |
| "state_dict_type": "FULL_STATE_DICT", |
| "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer" |
| }, |
| "fsdp_version": 2, |
| "generate_samples": false, |
| "generation_do_sample": true, |
| "generation_max_new_tokens": 50, |
| "generation_prompt_ratio": 0.5, |
| "generation_temperature": 0.7, |
| "gradient_accumulation_steps": 1, |
| "gradient_checkpointing": false, |
| "include_tkps": true, |
| "learning_rate": 1e-05, |
| "liger_fused_linear_cross_entropy": true, |
| "liger_glu_activation": true, |
| "liger_layer_norm": true, |
| "liger_rms_norm": true, |
| "liger_rope": true, |
| "lisa_layers_attribute": "model.layers", |
| "load_best_model_at_end": false, |
| "load_in_4bit": false, |
| "load_in_8bit": false, |
| "local_rank": 0, |
| "logging_steps": 5, |
| "lora_dropout": 0.0, |
| "loraplus_lr_embedding": 1e-06, |
| "lr_scheduler": "cosine", |
| "mean_resizing_embeddings": false, |
| "micro_batch_size": 2, |
| "model_config_type": "qwen3", |
| "num_epochs": 2.0, |
| "num_generation_samples": 3, |
| "optimizer": "adamw_torch_fused", |
| "otel_metrics_host": "localhost", |
| "otel_metrics_port": 8000, |
| "output_dir": "./outputs/qwen3-sft-stmt-tk/", |
| "pad_to_sequence_len": true, |
| "plugins": [ |
| "axolotl.integrations.liger.LigerPlugin" |
| ], |
| "pretrain_multipack_attn": true, |
| "profiler_steps_start": 0, |
| "qlora_sharded_model_loading": false, |
| "quantize_moe_experts": false, |
| "ray_num_workers": 1, |
| "resources_per_worker": { |
| "GPU": 1 |
| }, |
| "sample_packing": true, |
| "sample_packing_bin_size": 200, |
| "sample_packing_group_size": 100000, |
| "save_only_model": false, |
| "save_safetensors": true, |
| "save_steps": 0.05, |
| "save_total_limit": 3, |
| "saves_per_epoch": 10, |
| "sequence_len": 8192, |
| "shuffle_before_merging_datasets": false, |
| "shuffle_merged_datasets": true, |
| "skip_prepare_dataset": false, |
| "streaming_multipack_buffer_size": 10000, |
| "strict": false, |
| "tensor_parallel_size": 1, |
| "tf32": true, |
| "tiled_mlp_use_original_mlp": true, |
| "tokenizer_config": "Qwen/Qwen3-8B", |
| "tokenizer_save_jinja_files": true, |
| "torch_dtype": "torch.bfloat16", |
| "train_on_inputs": false, |
| "trl": { |
| "log_completions": false, |
| "mask_truncated_completions": false, |
| "ref_model_mixup_alpha": 0.9, |
| "ref_model_sync_steps": 64, |
| "scale_rewards": true, |
| "sync_ref_model": false, |
| "use_vllm": false, |
| "vllm_server_host": "0.0.0.0", |
| "vllm_server_port": 8000 |
| }, |
| "use_otel_metrics": false, |
| "use_ray": false, |
| "use_wandb": true, |
| "val_set_size": 0.0, |
| "vllm": { |
| "device": "auto", |
| "dtype": "auto", |
| "gpu_memory_utilization": 0.9, |
| "host": "0.0.0.0", |
| "port": 8000 |
| }, |
| "wandb_name": "qwen3-8b-tk-run1", |
| "wandb_project": "qwen3-sft-stmt-tk", |
| "warmup_ratio": 0.1, |
| "weight_decay": 0.0, |
| "world_size": 8 |
| } |
| [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|> |
| [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None |
| [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|> |
| [2026-03-16 19:06:47,178] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None |
| [2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:480] [PID:213] Unable to find prepared dataset in last_run_prepared/a7f1540a69de94eaad2000d92fac4b11 |
| [2026-03-16 19:08:33,239] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:213] Loading raw datasets... |
| [2026-03-16 19:08:33,239] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:213] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. |
|
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 0 files: 0it [00:00, ?it/s] |
| [2026-03-16 19:08:34,675] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:213] Loading dataset: xiaolesu/lean4-sft-stmt-tk with base_type: alpaca and prompt_style: None |
| [2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:213] min_input_len: 205 |
| [2026-03-16 19:08:36,088] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:213] max_input_len: 9159 |
|
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 0%| | 0/11192 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 1%| | 59/11192 [00:02<06:34, 28.25 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 3%|β | 295/11192 [00:02<01:02, 175.65 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 6%|β | 649/11192 [00:02<00:23, 453.06 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 8%|β | 885/11192 [00:02<00:16, 634.46 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 10%|β | 1121/11192 [00:02<00:11, 849.04 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 13%|ββ | 1416/11192 [00:02<00:08, 1166.00 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 15%|ββ | 1711/11192 [00:02<00:06, 1480.17 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 18%|ββ | 2006/11192 [00:02<00:05, 1697.58 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 21%|ββ | 2301/11192 [00:02<00:04, 1949.74 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 23%|βββ | 2596/11192 [00:03<00:04, 2145.10 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 26%|βββ | 2891/11192 [00:03<00:03, 2324.57 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 29%|βββ | 3245/11192 [00:03<00:03, 2566.75 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 70%|βββββββ | 7828/11192 [00:03<00:00, 14035.00 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=192): 100%|ββββββββββ| 11192/11192 [00:04<00:00, 2753.84 examples/s] |
| [2026-03-16 19:08:41,123] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:213] Dropped 362 sequences outside valid range ([None, 8192]) |
|
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/10830 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 1%| | 57/10830 [00:02<06:27, 27.78 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 3%|β | 285/10830 [00:02<01:00, 173.64 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 4%|β | 456/10830 [00:02<00:34, 299.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 6%|β | 684/10830 [00:02<00:20, 506.62 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 8%|β | 912/10830 [00:02<00:13, 736.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 11%|β | 1140/10830 [00:02<00:10, 947.17 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 13%|ββ | 1368/10830 [00:02<00:08, 1094.03 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 15%|ββ | 1596/10830 [00:02<00:07, 1269.49 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 17%|ββ | 1824/10830 [00:02<00:06, 1437.65 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 19%|ββ | 2052/10830 [00:03<00:05, 1614.63 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 21%|ββ | 2280/10830 [00:03<00:05, 1635.72 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 23%|βββ | 2508/10830 [00:03<00:04, 1732.21 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 25%|βββ | 2736/10830 [00:03<00:04, 1721.60 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 27%|βββ | 2964/10830 [00:03<00:04, 1703.27 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 29%|βββ | 3192/10830 [00:03<00:04, 1798.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 32%|ββββ | 3477/10830 [00:03<00:03, 1958.86 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 34%|ββββ | 3705/10830 [00:03<00:03, 2037.08 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 36%|ββββ | 3933/10830 [00:04<00:03, 2067.96 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 38%|ββββ | 4161/10830 [00:04<00:03, 2091.19 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 41%|ββββ | 4389/10830 [00:04<00:05, 1127.36 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 43%|βββββ | 4670/10830 [00:04<00:04, 1385.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 45%|βββββ | 4894/10830 [00:04<00:04, 1432.10 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 69%|βββββββ | 7526/10830 [00:04<00:00, 6499.14 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|ββββββββββ| 10830/10830 [00:05<00:00, 1931.57 examples/s] |
|
Add position_id column (Sample Packing) (num_proc=192): 0%| | 0/10830 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=192): 1%| | 57/10830 [00:02<06:33, 27.40 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 2%|β | 228/10830 [00:02<01:18, 135.14 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 4%|β | 456/10830 [00:02<00:33, 310.31 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 8%|β | 912/10830 [00:02<00:14, 692.10 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 11%|β | 1140/10830 [00:02<00:11, 858.26 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 13%|ββ | 1368/10830 [00:02<00:09, 1027.56 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 15%|ββ | 1596/10830 [00:02<00:07, 1182.55 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 17%|ββ | 1881/10830 [00:02<00:06, 1425.26 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 20%|ββ | 2166/10830 [00:03<00:05, 1604.97 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 22%|βββ | 2394/10830 [00:03<00:04, 1738.29 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 25%|βββ | 2679/10830 [00:03<00:04, 1951.23 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 63%|βββββββ | 6854/10830 [00:03<00:00, 11681.66 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|ββββββββββ| 10830/10830 [00:04<00:00, 2621.72 examples/s] |
|
Saving the dataset (0/42 shards): 0%| | 0/10830 [00:00<?, ? examples/s]
Saving the dataset (0/42 shards): 2%|β | 258/10830 [00:00<00:22, 464.02 examples/s]
Saving the dataset (1/42 shards): 2%|β | 258/10830 [00:00<00:22, 464.02 examples/s]
Saving the dataset (2/42 shards): 7%|β | 774/10830 [00:00<00:21, 464.02 examples/s]
Saving the dataset (3/42 shards): 7%|β | 774/10830 [00:00<00:21, 464.02 examples/s]
Saving the dataset (4/42 shards): 14%|ββ | 1548/10830 [00:00<00:20, 464.02 examples/s]
Saving the dataset (5/42 shards): 14%|ββ | 1548/10830 [00:00<00:20, 464.02 examples/s]
Saving the dataset (6/42 shards): 17%|ββ | 1806/10830 [00:00<00:19, 464.02 examples/s]
Saving the dataset (7/42 shards): 19%|ββ | 2064/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (8/42 shards): 21%|βββ | 2322/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (9/42 shards): 21%|βββ | 2322/10830 [00:00<00:18, 464.02 examples/s]
Saving the dataset (10/42 shards): 26%|βββ | 2838/10830 [00:00<00:17, 464.02 examples/s]
Saving the dataset (11/42 shards): 29%|βββ | 3096/10830 [00:00<00:16, 464.02 examples/s]
Saving the dataset (12/42 shards): 31%|βββ | 3354/10830 [00:00<00:16, 464.02 examples/s]
Saving the dataset (13/42 shards): 33%|ββββ | 3612/10830 [00:00<00:15, 464.02 examples/s]
Saving the dataset (14/42 shards): 33%|ββββ | 3612/10830 [00:00<00:15, 464.02 examples/s]
Saving the dataset (15/42 shards): 38%|ββββ | 4128/10830 [00:00<00:14, 464.02 examples/s]
Saving the dataset (16/42 shards): 40%|ββββ | 4386/10830 [00:00<00:13, 464.02 examples/s]
Saving the dataset (17/42 shards): 40%|ββββ | 4386/10830 [00:00<00:13, 464.02 examples/s]
Saving the dataset (18/42 shards): 45%|βββββ | 4902/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (19/42 shards): 48%|βββββ | 5160/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (20/42 shards): 48%|βββββ | 5160/10830 [00:00<00:12, 464.02 examples/s]
Saving the dataset (21/42 shards): 52%|ββββββ | 5676/10830 [00:00<00:11, 464.02 examples/s]
Saving the dataset (22/42 shards): 52%|ββββββ | 5676/10830 [00:00<00:11, 464.02 examples/s]
Saving the dataset (23/42 shards): 55%|ββββββ | 5934/10830 [00:00<00:10, 464.02 examples/s]
Saving the dataset (24/42 shards): 57%|ββββββ | 6192/10830 [00:00<00:09, 464.02 examples/s]
Saving the dataset (25/42 shards): 64%|βββββββ | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (26/42 shards): 64%|βββββββ | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (27/42 shards): 64%|βββββββ | 6966/10830 [00:00<00:08, 464.02 examples/s]
Saving the dataset (28/42 shards): 67%|βββββββ | 7224/10830 [00:00<00:07, 464.02 examples/s]
Saving the dataset (29/42 shards): 74%|ββββββββ | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (30/42 shards): 74%|ββββββββ | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (31/42 shards): 74%|ββββββββ | 7998/10830 [00:00<00:06, 464.02 examples/s]
Saving the dataset (32/42 shards): 79%|ββββββββ | 8514/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (33/42 shards): 81%|ββββββββ | 8772/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (34/42 shards): 81%|ββββββββ | 8772/10830 [00:00<00:04, 464.02 examples/s]
Saving the dataset (35/42 shards): 83%|βββββββββ | 9030/10830 [00:00<00:03, 464.02 examples/s]
Saving the dataset (36/42 shards): 88%|βββββββββ | 9545/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (37/42 shards): 88%|βββββββββ | 9545/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (38/42 shards): 91%|βββββββββ | 9802/10830 [00:00<00:02, 464.02 examples/s]
Saving the dataset (39/42 shards): 95%|ββββββββββ| 10316/10830 [00:00<00:01, 464.02 examples/s]
Saving the dataset (40/42 shards): 95%|ββββββββββ| 10316/10830 [00:00<00:01, 464.02 examples/s]
Saving the dataset (41/42 shards): 98%|ββββββββββ| 10573/10830 [00:00<00:00, 464.02 examples/s]
Saving the dataset (42/42 shards): 100%|ββββββββββ| 10830/10830 [00:00<00:00, 464.02 examples/s]
Saving the dataset (42/42 shards): 100%|ββββββββββ| 10830/10830 [00:00<00:00, 16314.56 examples/s] |
| [2026-03-16 19:08:54,045] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:213] total_num_tokens: 33_957_071 |
| [2026-03-16 19:08:54,340] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:213] `total_supervised_tokens: 32_028_150` |
| [2026-03-16 19:08:55,893] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.7050187587738037 |
| [2026-03-16 19:11:05,467] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2148, 2146, 2148, 2145, 2146, 2146, 2148, 2145] |
| [2026-03-16 19:11:06,172] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:213] data_loader_len: 268 |
| [2026-03-16 19:11:06,189] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:213] sample_packing_eff_est across ranks: [0.9646614789962769, 0.9657852649688721, 0.9646614789962769, 0.9657852649688721, 0.9648860096931458, 0.9648860096931458, 0.9653354287147522, 0.9657852649688721] |
| [2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:213] sample_packing_eff_est: 0.97 |
| [2026-03-16 19:11:06,190] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:213] total_num_steps: 536 |
| [2026-03-16 19:11:06,192] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:213] Maximum number of steps set at 536 |
| [2026-03-16 19:11:06,242] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:213] loading tokenizer... Qwen/Qwen3-8B |
| [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:213] EOS: 151645 / <|im_end|> |
| [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:213] BOS: None / None |
| [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:213] PAD: 151643 / <|endoftext|> |
| [2026-03-16 19:11:07,694] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:213] UNK: None / None |
| [2026-03-16 19:11:07,694] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:213] Loading model |
| [2026-03-16 19:11:07,808] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:213] Patched Trainer.evaluation_loop with nanmean loss calculation |
| [2026-03-16 19:11:07,809] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:213] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation |
| [2026-03-16 19:11:07,811] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:400] [PID:213] Applying multipack dataloader patch for sample packing... |
| [2026-03-16 19:11:09,375] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:104] [PID:213] Applying LIGER to qwen3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True} |
|
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 100%|ββββββββββ| 399/399 [00:00<00:00, 9671.84it/s] |
| [2026-03-16 19:11:09,882] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:359] [PID:213] Converting modules to torch.bfloat16 |
| [2026-03-16 19:11:09,885] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:213] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved) |
| [2026-03-16 19:11:11,696] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:213] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.Multiple deprecation warnings due to FSDP2 conversion: |
| sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None. |
| [2026-03-16 19:11:12,192] [INFO] [axolotl.train.save_initial_configs:417] [PID:213] Pre-saving tokenizer to ./outputs/qwen3-sft-stmt-tk/... |
| [2026-03-16 19:11:12,283] [INFO] [axolotl.train.save_initial_configs:422] [PID:213] Pre-saving model config to ./outputs/qwen3-sft-stmt-tk/... |
| [2026-03-16 19:11:12,286] [INFO] [axolotl.train.execute_training:218] [PID:213] Starting trainer... |
| [2026-03-16 19:11:14,793] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:213] generate_batches time: 0.9547648429870605 |
| [2026-03-16 19:11:14,796] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:213] gather_len_batches: [2103, 2104, 2104, 2104, 2103, 2104, 2106, 2104] |
| [2026-03-16 19:11:15,013] [INFO] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:34] [PID:213] Broadcasting full state dict to all ranks... |
| [2026-03-16 19:11:22,269] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.fsdp2_load_full_state_dict:86] [PID:213] Time taken to load full state dict: 7.26 seconds |
| [2026-03-16 19:11:22,270] [DEBUG] [axolotl.monkeypatch.accelerate.fsdp2.log_gpu_memory_usage:127] [PID:213] Memory usage after broadcasting full state dict 3.067GB (+3.067GB allocated, +3.178GB reserved) |
| wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY. |
| wandb: Currently logged in as: suxiaole0223 (suxiaole) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
| wandb: setting up run kje10pck |
| wandb: Tracking run with wandb version 0.25.1 |
| wandb: Run data is saved locally in /workspace/axolotl-workspace/wandb/run-20260316_191122-kje10pck |
| wandb: Run `wandb offline` to turn off syncing. |
| wandb: Syncing run qwen3-8b-tk-run1 |
| wandb: βοΈ View project at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk |
| wandb: π View run at https://wandb.ai/suxiaole/qwen3-sft-stmt-tk/runs/kje10pck |
| wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") |
| wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files. |
| [2026-03-16 19:11:25,554] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:213] The Axolotl config has been saved to the WandB run under files. |
|
0%| | 0/536 [00:00<?, ?it/s][2026-03-16 19:11:57,210] [WARNING] [py.warnings._showwarnmsg:110] [PID:213] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/nn/attention/flex_attention.py:1622: FutureWarning: return_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead. |
| _warn_once( |
|
|
|
0%| | 1/536 [00:40<6:03:21, 40.75s/it]
0%| | 2/536 [00:43<2:42:00, 18.20s/it]
1%| | 3/536 [00:45<1:37:15, 10.95s/it]
1%| | 4/536 [00:47<1:07:23, 7.60s/it]
1%| | 5/536 [00:50<50:28, 5.70s/it]
{'loss': '0.8667', 'grad_norm': '2.609', 'learning_rate': '7.547e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6531', 'tokens/total': 655360, 'tokens/trainable': 611049, 'epoch': '0.01908'} |
|
1%| | 5/536 [00:50<50:28, 5.70s/it]
1%| | 6/536 [00:52<40:15, 4.56s/it]
1%|β | 7/536 [00:55<34:02, 3.86s/it]
1%|β | 8/536 [00:57<30:00, 3.41s/it]
2%|β | 9/536 [00:59<26:45, 3.05s/it]
2%|β | 10/536 [01:02<24:45, 2.82s/it]
{'loss': '0.8307', 'grad_norm': '2.5', 'learning_rate': '1.698e-06', 'ppl': '2.295', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6647', 'tokens/total': 1310720, 'tokens/trainable': 1224548, 'epoch': '0.03817'} |
|
2%|β | 10/536 [01:02<24:45, 2.82s/it]
2%|β | 11/536 [01:04<23:13, 2.65s/it]
2%|β | 12/536 [01:06<22:04, 2.53s/it]
2%|β | 13/536 [01:08<21:32, 2.47s/it]
3%|β | 14/536 [01:11<21:27, 2.47s/it]
3%|β | 15/536 [01:13<21:28, 2.47s/it]
{'loss': '0.8487', 'grad_norm': '2.453', 'learning_rate': '2.642e-06', 'ppl': '2.337', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6160', 'tokens/total': 1966080, 'tokens/trainable': 1834432, 'epoch': '0.05725'} |
|
3%|β | 15/536 [01:13<21:28, 2.47s/it]
3%|β | 16/536 [01:16<21:18, 2.46s/it]
3%|β | 17/536 [01:18<20:51, 2.41s/it]
3%|β | 18/536 [01:20<20:44, 2.40s/it]
4%|β | 19/536 [01:23<21:59, 2.55s/it]
4%|β | 20/536 [01:26<21:40, 2.52s/it]
{'loss': '0.7713', 'grad_norm': '1.898', 'learning_rate': '3.585e-06', 'ppl': '2.163', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6256', 'tokens/total': 2621440, 'tokens/trainable': 2448388, 'epoch': '0.07634'} |
|
4%|β | 20/536 [01:26<21:40, 2.52s/it]
4%|β | 21/536 [01:28<21:23, 2.49s/it]
4%|β | 22/536 [01:31<20:49, 2.43s/it]
4%|β | 23/536 [01:33<20:37, 2.41s/it]
4%|β | 24/536 [01:35<20:37, 2.42s/it]
5%|β | 25/536 [01:38<20:01, 2.35s/it]
{'loss': '0.7452', 'grad_norm': '1.273', 'learning_rate': '4.528e-06', 'ppl': '2.107', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6954', 'tokens/total': 3276800, 'tokens/trainable': 3060985, 'epoch': '0.09542'} |
|
5%|β | 25/536 [01:38<20:01, 2.35s/it]
5%|β | 26/536 [01:40<19:41, 2.32s/it]
5%|β | 27/536 [01:42<19:26, 2.29s/it][2026-03-16 19:13:17,483] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-27 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.48s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.48s/it] |
|
5%|β | 28/536 [03:16<4:11:47, 29.74s/it]
5%|β | 29/536 [03:18<3:01:39, 21.50s/it]
6%|β | 30/536 [03:20<2:12:38, 15.73s/it]
{'loss': '0.718', 'grad_norm': '0.7695', 'learning_rate': '5.472e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6706', 'tokens/total': 3932160, 'tokens/trainable': 3670695, 'epoch': '0.1145'} |
|
6%|β | 30/536 [03:20<2:12:38, 15.73s/it]
6%|β | 31/536 [03:23<1:38:27, 11.70s/it]
6%|β | 32/536 [03:25<1:14:38, 8.89s/it]
6%|β | 33/536 [03:27<57:48, 6.90s/it]
6%|β | 34/536 [03:29<46:07, 5.51s/it]
7%|β | 35/536 [03:32<37:56, 4.54s/it]
{'loss': '0.6699', 'grad_norm': '0.6406', 'learning_rate': '6.415e-06', 'ppl': '1.954', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6770', 'tokens/total': 4587520, 'tokens/trainable': 4284736, 'epoch': '0.1336'} |
|
7%|β | 35/536 [03:32<37:56, 4.54s/it]
7%|β | 36/536 [03:34<32:19, 3.88s/it]
7%|β | 37/536 [03:37<28:45, 3.46s/it]
7%|β | 38/536 [03:39<26:05, 3.14s/it]
7%|β | 39/536 [03:41<24:10, 2.92s/it]
7%|β | 40/536 [03:44<22:31, 2.72s/it]
{'loss': '0.6393', 'grad_norm': '0.418', 'learning_rate': '7.358e-06', 'ppl': '1.895', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6668', 'tokens/total': 5242880, 'tokens/trainable': 4896504, 'epoch': '0.1527'} |
|
7%|β | 40/536 [03:44<22:31, 2.72s/it]
8%|β | 41/536 [03:46<21:24, 2.59s/it]
8%|β | 42/536 [03:48<20:36, 2.50s/it]
8%|β | 43/536 [03:51<20:06, 2.45s/it]
8%|β | 44/536 [03:53<19:38, 2.39s/it]
8%|β | 45/536 [03:55<19:17, 2.36s/it]
{'loss': '0.5953', 'grad_norm': '0.3594', 'learning_rate': '8.302e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 5898240, 'tokens/trainable': 5505933, 'epoch': '0.1718'} |
|
8%|β | 45/536 [03:55<19:17, 2.36s/it]
9%|β | 46/536 [03:57<19:17, 2.36s/it]
9%|β | 47/536 [04:00<19:01, 2.33s/it]
9%|β | 48/536 [04:02<19:02, 2.34s/it]
9%|β | 49/536 [04:04<19:02, 2.35s/it]
9%|β | 50/536 [04:07<18:55, 2.34s/it]
{'loss': '0.5779', 'grad_norm': '0.332', 'learning_rate': '9.245e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6574', 'tokens/total': 6553600, 'tokens/trainable': 6116643, 'epoch': '0.1908'} |
|
9%|β | 50/536 [04:07<18:55, 2.34s/it]
10%|β | 51/536 [04:09<18:46, 2.32s/it]
10%|β | 52/536 [04:11<18:33, 2.30s/it]
10%|β | 53/536 [04:14<18:19, 2.28s/it]
10%|β | 54/536 [04:16<18:23, 2.29s/it][2026-03-16 19:15:50,860] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-54 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.65s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.65s/it] |
|
10%|β | 55/536 [05:48<3:55:25, 29.37s/it]
{'loss': '0.5579', 'grad_norm': '0.2793', 'learning_rate': '1e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4847', 'tokens/total': 7208960, 'tokens/trainable': 6728061, 'epoch': '0.2099'} |
|
10%|β | 55/536 [05:48<3:55:25, 29.37s/it]
10%|β | 56/536 [05:51<2:50:56, 21.37s/it]
11%|β | 57/536 [05:54<2:05:27, 15.72s/it]
11%|β | 58/536 [05:56<1:33:09, 11.69s/it]
11%|β | 59/536 [05:58<1:11:13, 8.96s/it]
11%|β | 60/536 [06:01<55:01, 6.94s/it]
{'loss': '0.5485', 'grad_norm': '0.2773', 'learning_rate': '9.996e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 7864320, 'tokens/trainable': 7336524, 'epoch': '0.229'} |
|
11%|β | 60/536 [06:01<55:01, 6.94s/it]
11%|ββ | 61/536 [06:03<43:45, 5.53s/it]
12%|ββ | 62/536 [06:05<36:05, 4.57s/it]
12%|ββ | 63/536 [06:08<30:31, 3.87s/it]
12%|ββ | 64/536 [06:10<26:38, 3.39s/it]
12%|ββ | 65/536 [06:12<24:01, 3.06s/it]
{'loss': '0.5385', 'grad_norm': '0.2734', 'learning_rate': '9.987e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6565', 'tokens/total': 8519680, 'tokens/trainable': 7944984, 'epoch': '0.2481'} |
|
12%|ββ | 65/536 [06:12<24:01, 3.06s/it]
12%|ββ | 66/536 [06:14<22:09, 2.83s/it]
12%|ββ | 67/536 [06:17<21:06, 2.70s/it]
13%|ββ | 68/536 [06:19<20:07, 2.58s/it]
13%|ββ | 69/536 [06:21<19:21, 2.49s/it]
13%|ββ | 70/536 [06:24<19:00, 2.45s/it]
{'loss': '0.5197', 'grad_norm': '0.2578', 'learning_rate': '9.973e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 9175040, 'tokens/trainable': 8556200, 'epoch': '0.2672'} |
|
13%|ββ | 70/536 [06:24<19:00, 2.45s/it]
13%|ββ | 71/536 [06:26<18:33, 2.39s/it]
13%|ββ | 72/536 [06:28<18:14, 2.36s/it]
14%|ββ | 73/536 [06:31<18:01, 2.34s/it]
14%|ββ | 74/536 [06:33<18:02, 2.34s/it]
14%|ββ | 75/536 [06:35<17:54, 2.33s/it]
{'loss': '0.5316', 'grad_norm': '0.3008', 'learning_rate': '9.953e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6728', 'tokens/total': 9830400, 'tokens/trainable': 9167282, 'epoch': '0.2863'} |
|
14%|ββ | 75/536 [06:35<17:54, 2.33s/it]
14%|ββ | 76/536 [06:38<17:54, 2.34s/it]
14%|ββ | 77/536 [06:40<18:05, 2.37s/it]
15%|ββ | 78/536 [06:43<18:28, 2.42s/it]
15%|ββ | 79/536 [06:45<18:05, 2.37s/it]
15%|ββ | 80/536 [06:47<17:47, 2.34s/it]
{'loss': '0.5154', 'grad_norm': '0.3164', 'learning_rate': '9.929e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6730', 'tokens/total': 10485760, 'tokens/trainable': 9774908, 'epoch': '0.3053'} |
|
15%|ββ | 80/536 [06:47<17:47, 2.34s/it]
15%|ββ | 81/536 [06:49<17:39, 2.33s/it][2026-03-16 19:18:24,375] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-81 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.37s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.37s/it] |
|
15%|ββ | 82/536 [08:22<3:43:29, 29.54s/it]
15%|ββ | 83/536 [08:25<2:41:12, 21.35s/it]
16%|ββ | 84/536 [08:27<1:57:43, 15.63s/it]
16%|ββ | 85/536 [08:29<1:27:29, 11.64s/it]
{'loss': '0.5143', 'grad_norm': '0.2363', 'learning_rate': '9.899e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6604', 'tokens/total': 11141120, 'tokens/trainable': 10388109, 'epoch': '0.3244'} |
|
16%|ββ | 85/536 [08:29<1:27:29, 11.64s/it]
16%|ββ | 86/536 [08:32<1:06:16, 8.84s/it]
16%|ββ | 87/536 [08:34<51:19, 6.86s/it]
16%|ββ | 88/536 [08:36<40:55, 5.48s/it]
17%|ββ | 89/536 [08:38<33:37, 4.51s/it]
17%|ββ | 90/536 [08:41<28:43, 3.86s/it]
{'loss': '0.4957', 'grad_norm': '0.2412', 'learning_rate': '9.864e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 11796480, 'tokens/trainable': 10999678, 'epoch': '0.3435'} |
|
17%|ββ | 90/536 [08:41<28:43, 3.86s/it]
17%|ββ | 91/536 [08:43<25:17, 3.41s/it]
17%|ββ | 92/536 [08:45<22:49, 3.08s/it]
17%|ββ | 93/536 [08:48<21:12, 2.87s/it]
18%|ββ | 94/536 [08:50<19:44, 2.68s/it]
18%|ββ | 95/536 [08:52<19:22, 2.64s/it]
{'loss': '0.509', 'grad_norm': '0.2236', 'learning_rate': '9.823e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5974', 'tokens/total': 12451840, 'tokens/trainable': 11609345, 'epoch': '0.3626'} |
|
18%|ββ | 95/536 [08:52<19:22, 2.64s/it]
18%|ββ | 96/536 [08:55<18:35, 2.54s/it]
18%|ββ | 97/536 [08:57<18:01, 2.46s/it]
18%|ββ | 98/536 [09:00<19:09, 2.62s/it]
18%|ββ | 99/536 [09:03<19:00, 2.61s/it]
19%|ββ | 100/536 [09:05<18:16, 2.51s/it]
{'loss': '0.4925', 'grad_norm': '0.2451', 'learning_rate': '9.778e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6696', 'tokens/total': 13107200, 'tokens/trainable': 12218448, 'epoch': '0.3817'} |
|
19%|ββ | 100/536 [09:05<18:16, 2.51s/it]
19%|ββ | 101/536 [09:07<17:50, 2.46s/it]
19%|ββ | 102/536 [09:09<17:19, 2.40s/it]
19%|ββ | 103/536 [09:12<16:59, 2.35s/it]
19%|ββ | 104/536 [09:14<16:47, 2.33s/it]
20%|ββ | 105/536 [09:16<16:34, 2.31s/it]
{'loss': '0.5051', 'grad_norm': '0.25', 'learning_rate': '9.727e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6724', 'tokens/total': 13762560, 'tokens/trainable': 12826468, 'epoch': '0.4008'} |
|
20%|ββ | 105/536 [09:16<16:34, 2.31s/it]
20%|ββ | 106/536 [09:19<16:28, 2.30s/it]
20%|ββ | 107/536 [09:21<16:26, 2.30s/it]
20%|ββ | 108/536 [09:23<16:27, 2.31s/it][2026-03-16 19:20:58,221] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-108 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.13s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.13s/it] |
|
20%|ββ | 109/536 [11:03<3:44:22, 31.53s/it]
21%|ββ | 110/536 [11:05<2:41:43, 22.78s/it]
{'loss': '0.4725', 'grad_norm': '0.2266', 'learning_rate': '9.672e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6506', 'tokens/total': 14417920, 'tokens/trainable': 13440042, 'epoch': '0.4198'} |
|
21%|ββ | 110/536 [11:05<2:41:43, 22.78s/it]
21%|ββ | 111/536 [11:07<1:57:41, 16.61s/it]
21%|ββ | 112/536 [11:10<1:26:54, 12.30s/it]
21%|ββ | 113/536 [11:12<1:05:42, 9.32s/it]
21%|βββ | 114/536 [11:14<50:52, 7.23s/it]
21%|βββ | 115/536 [11:17<40:26, 5.76s/it]
{'loss': '0.5004', 'grad_norm': '0.2256', 'learning_rate': '9.612e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6525', 'tokens/total': 15073280, 'tokens/trainable': 14049913, 'epoch': '0.4389'} |
|
21%|βββ | 115/536 [11:17<40:26, 5.76s/it]
22%|βββ | 116/536 [11:19<33:04, 4.72s/it]
22%|βββ | 117/536 [11:22<28:43, 4.11s/it]
22%|βββ | 118/536 [11:24<24:56, 3.58s/it]
22%|βββ | 119/536 [11:26<22:09, 3.19s/it]
22%|βββ | 120/536 [11:29<20:22, 2.94s/it]
{'loss': '0.4727', 'grad_norm': '0.248', 'learning_rate': '9.546e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6422', 'tokens/total': 15728640, 'tokens/trainable': 14657396, 'epoch': '0.458'} |
|
22%|βββ | 120/536 [11:29<20:22, 2.94s/it]
23%|βββ | 121/536 [11:31<19:02, 2.75s/it]
23%|βββ | 122/536 [11:33<18:01, 2.61s/it]
23%|βββ | 123/536 [11:36<17:25, 2.53s/it]
23%|βββ | 124/536 [11:38<17:02, 2.48s/it]
23%|βββ | 125/536 [11:40<16:31, 2.41s/it]
{'loss': '0.4808', 'grad_norm': '0.2344', 'learning_rate': '9.476e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6830', 'tokens/total': 16384000, 'tokens/trainable': 15266794, 'epoch': '0.4771'} |
|
23%|βββ | 125/536 [11:40<16:31, 2.41s/it]
24%|βββ | 126/536 [11:43<16:21, 2.39s/it]
24%|βββ | 127/536 [11:45<16:21, 2.40s/it]
24%|βββ | 128/536 [11:47<16:06, 2.37s/it]
24%|βββ | 129/536 [11:50<15:59, 2.36s/it]
24%|βββ | 130/536 [11:52<15:57, 2.36s/it]
{'loss': '0.4726', 'grad_norm': '0.2451', 'learning_rate': '9.401e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6471', 'tokens/total': 17039360, 'tokens/trainable': 15876387, 'epoch': '0.4962'} |
|
24%|βββ | 130/536 [11:52<15:57, 2.36s/it]
24%|βββ | 131/536 [11:54<15:48, 2.34s/it]
25%|βββ | 132/536 [11:57<15:37, 2.32s/it]
25%|βββ | 133/536 [11:59<15:31, 2.31s/it]
25%|βββ | 134/536 [12:01<15:54, 2.37s/it]
25%|βββ | 135/536 [12:04<16:09, 2.42s/it]
{'loss': '0.4864', 'grad_norm': '0.2344', 'learning_rate': '9.322e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6056', 'tokens/total': 17694720, 'tokens/trainable': 16486440, 'epoch': '0.5153'} |
|
25%|βββ | 135/536 [12:04<16:09, 2.42s/it][2026-03-16 19:23:38,988] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-135 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.41s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.41s/it] |
|
25%|βββ | 136/536 [13:41<3:26:11, 30.93s/it]
26%|βββ | 137/536 [13:45<2:30:52, 22.69s/it]
26%|βββ | 138/536 [13:47<1:49:51, 16.56s/it]
26%|βββ | 139/536 [13:49<1:21:09, 12.27s/it]
26%|βββ | 140/536 [13:52<1:01:08, 9.26s/it]
{'loss': '0.4817', 'grad_norm': '0.2275', 'learning_rate': '9.238e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6712', 'tokens/total': 18350080, 'tokens/trainable': 17095060, 'epoch': '0.5344'} |
|
26%|βββ | 140/536 [13:52<1:01:08, 9.26s/it]
26%|βββ | 141/536 [13:54<47:09, 7.16s/it]
26%|βββ | 142/536 [13:56<37:27, 5.70s/it]
27%|βββ | 143/536 [13:58<30:36, 4.67s/it]
27%|βββ | 144/536 [14:01<25:57, 3.97s/it]
27%|βββ | 145/536 [14:03<22:36, 3.47s/it]
{'loss': '0.4827', 'grad_norm': '0.249', 'learning_rate': '9.149e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6633', 'tokens/total': 19005440, 'tokens/trainable': 17703368, 'epoch': '0.5534'} |
|
27%|βββ | 145/536 [14:03<22:36, 3.47s/it]
27%|βββ | 146/536 [14:05<20:19, 3.13s/it]
27%|βββ | 147/536 [14:08<18:42, 2.89s/it]
28%|βββ | 148/536 [14:10<17:44, 2.74s/it]
28%|βββ | 149/536 [14:12<16:47, 2.60s/it]
28%|βββ | 150/536 [14:15<16:15, 2.53s/it]
{'loss': '0.4892', 'grad_norm': '0.2217', 'learning_rate': '9.057e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6466', 'tokens/total': 19660800, 'tokens/trainable': 18311084, 'epoch': '0.5725'} |
|
28%|βββ | 150/536 [14:15<16:15, 2.53s/it]
28%|βββ | 151/536 [14:17<15:49, 2.47s/it]
28%|βββ | 152/536 [14:20<15:51, 2.48s/it]
29%|βββ | 153/536 [14:22<16:18, 2.55s/it]
29%|βββ | 154/536 [14:25<16:18, 2.56s/it]
29%|βββ | 155/536 [14:27<16:04, 2.53s/it]
{'loss': '0.4618', 'grad_norm': '0.2236', 'learning_rate': '8.959e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6104', 'tokens/total': 20316160, 'tokens/trainable': 18920000, 'epoch': '0.5916'} |
|
29%|βββ | 155/536 [14:27<16:04, 2.53s/it]
29%|βββ | 156/536 [14:30<15:36, 2.47s/it]
29%|βββ | 157/536 [14:32<15:16, 2.42s/it]
29%|βββ | 158/536 [14:35<15:24, 2.45s/it]
30%|βββ | 159/536 [14:37<15:05, 2.40s/it]
30%|βββ | 160/536 [14:39<14:54, 2.38s/it]
{'loss': '0.471', 'grad_norm': '0.2793', 'learning_rate': '8.858e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6516', 'tokens/total': 20971520, 'tokens/trainable': 19529720, 'epoch': '0.6107'} |
|
30%|βββ | 160/536 [14:39<14:54, 2.38s/it]
30%|βββ | 161/536 [14:41<14:48, 2.37s/it]
30%|βββ | 162/536 [14:44<14:28, 2.32s/it][2026-03-16 19:26:18,649] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-162 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.63s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.63s/it] |
|
30%|βββ | 163/536 [16:21<3:11:06, 30.74s/it]
31%|βββ | 164/536 [16:23<2:17:38, 22.20s/it]
31%|βββ | 165/536 [16:25<1:40:18, 16.22s/it]
{'loss': '0.4703', 'grad_norm': '0.2383', 'learning_rate': '8.752e-06', 'ppl': '1.6', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6785', 'tokens/total': 21626880, 'tokens/trainable': 20137712, 'epoch': '0.6298'} |
|
31%|βββ | 165/536 [16:25<1:40:18, 16.22s/it]
31%|βββ | 166/536 [16:28<1:14:13, 12.04s/it]
31%|βββ | 167/536 [16:30<56:04, 9.12s/it]
31%|ββββ | 168/536 [16:32<43:22, 7.07s/it]
32%|ββββ | 169/536 [16:34<34:28, 5.64s/it]
32%|ββββ | 170/536 [16:37<28:13, 4.63s/it]
{'loss': '0.4727', 'grad_norm': '0.2139', 'learning_rate': '8.643e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6694', 'tokens/total': 22282240, 'tokens/trainable': 20749040, 'epoch': '0.6489'} |
|
32%|ββββ | 170/536 [16:37<28:13, 4.63s/it]
32%|ββββ | 171/536 [16:39<24:07, 3.96s/it]
32%|ββββ | 172/536 [16:42<21:19, 3.51s/it]
32%|ββββ | 173/536 [16:44<19:34, 3.24s/it]
32%|ββββ | 174/536 [16:46<17:46, 2.95s/it]
33%|ββββ | 175/536 [16:49<16:24, 2.73s/it]
{'loss': '0.4856', 'grad_norm': '0.2119', 'learning_rate': '8.53e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6812', 'tokens/total': 22937600, 'tokens/trainable': 21358216, 'epoch': '0.6679'} |
|
33%|ββββ | 175/536 [16:49<16:24, 2.73s/it]
33%|ββββ | 176/536 [16:51<15:32, 2.59s/it]
33%|ββββ | 177/536 [16:54<16:38, 2.78s/it]
33%|ββββ | 178/536 [16:57<15:52, 2.66s/it]
33%|ββββ | 179/536 [16:59<15:07, 2.54s/it]
34%|ββββ | 180/536 [17:01<14:47, 2.49s/it]
{'loss': '0.4551', 'grad_norm': '0.2266', 'learning_rate': '8.413e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6375', 'tokens/total': 23592960, 'tokens/trainable': 21963408, 'epoch': '0.687'} |
|
34%|ββββ | 180/536 [17:01<14:47, 2.49s/it]
34%|ββββ | 181/536 [17:04<14:25, 2.44s/it]
34%|ββββ | 182/536 [17:06<14:08, 2.40s/it]
34%|ββββ | 183/536 [17:08<13:47, 2.34s/it]
34%|ββββ | 184/536 [17:10<13:42, 2.34s/it]
35%|ββββ | 185/536 [17:13<13:34, 2.32s/it]
{'loss': '0.4654', 'grad_norm': '0.2695', 'learning_rate': '8.292e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6688', 'tokens/total': 24248320, 'tokens/trainable': 22570984, 'epoch': '0.7061'} |
|
35%|ββββ | 185/536 [17:13<13:34, 2.32s/it]
35%|ββββ | 186/536 [17:15<13:21, 2.29s/it]
35%|ββββ | 187/536 [17:17<13:19, 2.29s/it]
35%|ββββ | 188/536 [17:19<13:11, 2.28s/it]
35%|ββββ | 189/536 [17:22<13:08, 2.27s/it][2026-03-16 19:28:56,617] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-189 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.04s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.04s/it] |
|
35%|ββββ | 190/536 [19:01<3:00:49, 31.36s/it]
{'loss': '0.4727', 'grad_norm': '0.2285', 'learning_rate': '8.168e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4795', 'tokens/total': 24903680, 'tokens/trainable': 23180680, 'epoch': '0.7252'} |
|
35%|ββββ | 190/536 [19:01<3:00:49, 31.36s/it]
36%|ββββ | 191/536 [19:03<2:10:15, 22.65s/it]
36%|ββββ | 192/536 [19:06<1:35:23, 16.64s/it]
36%|ββββ | 193/536 [19:08<1:11:08, 12.44s/it]
36%|ββββ | 194/536 [19:11<53:42, 9.42s/it]
36%|ββββ | 195/536 [19:13<41:27, 7.30s/it]
{'loss': '0.462', 'grad_norm': '0.2158', 'learning_rate': '8.041e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6567', 'tokens/total': 25559040, 'tokens/trainable': 23790730, 'epoch': '0.7443'} |
|
36%|ββββ | 195/536 [19:13<41:27, 7.30s/it]
37%|ββββ | 196/536 [19:16<32:54, 5.81s/it]
37%|ββββ | 197/536 [19:18<27:50, 4.93s/it]
37%|ββββ | 198/536 [19:21<23:17, 4.13s/it]
37%|ββββ | 199/536 [19:23<20:04, 3.57s/it]
37%|ββββ | 200/536 [19:25<17:43, 3.17s/it]
{'loss': '0.4676', 'grad_norm': '0.2188', 'learning_rate': '7.91e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6880', 'tokens/total': 26214400, 'tokens/trainable': 24401252, 'epoch': '0.7634'} |
|
37%|ββββ | 200/536 [19:25<17:43, 3.17s/it]
38%|ββββ | 201/536 [19:27<16:11, 2.90s/it]
38%|ββββ | 202/536 [19:30<15:05, 2.71s/it]
38%|ββββ | 203/536 [19:32<14:15, 2.57s/it]
38%|ββββ | 204/536 [19:34<13:41, 2.47s/it]
38%|ββββ | 205/536 [19:37<13:22, 2.43s/it]
{'loss': '0.4504', 'grad_norm': '0.2158', 'learning_rate': '7.776e-06', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6617', 'tokens/total': 26869760, 'tokens/trainable': 25010696, 'epoch': '0.7824'} |
|
38%|ββββ | 205/536 [19:37<13:22, 2.43s/it]
38%|ββββ | 206/536 [19:39<13:08, 2.39s/it]
39%|ββββ | 207/536 [19:41<12:51, 2.34s/it]
39%|ββββ | 208/536 [19:43<12:41, 2.32s/it]
39%|ββββ | 209/536 [19:46<12:40, 2.32s/it]
39%|ββββ | 210/536 [19:48<13:00, 2.39s/it]
{'loss': '0.4614', 'grad_norm': '0.2295', 'learning_rate': '7.639e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5994', 'tokens/total': 27525120, 'tokens/trainable': 25617872, 'epoch': '0.8015'} |
|
39%|ββββ | 210/536 [19:48<13:00, 2.39s/it]
39%|ββββ | 211/536 [19:51<13:14, 2.44s/it]
40%|ββββ | 212/536 [19:53<12:58, 2.40s/it]
40%|ββββ | 213/536 [19:55<12:42, 2.36s/it]
40%|ββββ | 214/536 [19:58<12:29, 2.33s/it]
40%|ββββ | 215/536 [20:00<12:22, 2.31s/it]
{'loss': '0.477', 'grad_norm': '0.2412', 'learning_rate': '7.5e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6697', 'tokens/total': 28180480, 'tokens/trainable': 26227438, 'epoch': '0.8206'} |
|
40%|ββββ | 215/536 [20:00<12:22, 2.31s/it]
40%|ββββ | 216/536 [20:02<12:22, 2.32s/it][2026-03-16 19:31:37,309] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-216 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.85s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.85s/it] |
|
40%|ββββ | 217/536 [21:40<2:45:11, 31.07s/it]
41%|ββββ | 218/536 [21:43<1:59:15, 22.50s/it]
41%|ββββ | 219/536 [21:45<1:26:49, 16.43s/it]
41%|ββββ | 220/536 [21:47<1:04:08, 12.18s/it]
{'loss': '0.4535', 'grad_norm': '0.2148', 'learning_rate': '7.358e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 28835840, 'tokens/trainable': 26833456, 'epoch': '0.8397'} |
|
41%|ββββ | 220/536 [21:47<1:04:08, 12.18s/it]
41%|ββββ | 221/536 [21:50<48:25, 9.22s/it]
41%|βββββ | 222/536 [21:52<37:25, 7.15s/it]
42%|βββββ | 223/536 [21:54<29:44, 5.70s/it]
42%|βββββ | 224/536 [21:57<24:17, 4.67s/it]
42%|βββββ | 225/536 [21:59<20:30, 3.96s/it]
{'loss': '0.4639', 'grad_norm': '0.2197', 'learning_rate': '7.213e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6762', 'tokens/total': 29491200, 'tokens/trainable': 27444416, 'epoch': '0.8588'} |
|
42%|βββββ | 225/536 [21:59<20:30, 3.96s/it]
42%|βββββ | 226/536 [22:01<17:50, 3.45s/it]
42%|βββββ | 227/536 [22:03<15:56, 3.10s/it]
43%|βββββ | 228/536 [22:06<14:40, 2.86s/it]
43%|βββββ | 229/536 [22:08<14:24, 2.82s/it]
43%|βββββ | 230/536 [22:11<14:01, 2.75s/it]
{'loss': '0.4578', 'grad_norm': '0.2217', 'learning_rate': '7.066e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5816', 'tokens/total': 30146560, 'tokens/trainable': 28048432, 'epoch': '0.8779'} |
|
43%|βββββ | 230/536 [22:11<14:01, 2.75s/it]
43%|βββββ | 231/536 [22:13<13:21, 2.63s/it]
43%|βββββ | 232/536 [22:16<12:45, 2.52s/it]
43%|βββββ | 233/536 [22:18<12:29, 2.47s/it]
44%|βββββ | 234/536 [22:20<12:15, 2.44s/it]
44%|βββββ | 235/536 [22:23<11:59, 2.39s/it]
{'loss': '0.4497', 'grad_norm': '0.2354', 'learning_rate': '6.917e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6580', 'tokens/total': 30801920, 'tokens/trainable': 28655952, 'epoch': '0.8969'} |
|
44%|βββββ | 235/536 [22:23<11:59, 2.39s/it]
44%|βββββ | 236/536 [22:25<11:55, 2.39s/it]
44%|βββββ | 237/536 [22:27<11:47, 2.37s/it]
44%|βββββ | 238/536 [22:30<11:38, 2.34s/it]
45%|βββββ | 239/536 [22:32<11:38, 2.35s/it]
45%|βββββ | 240/536 [22:34<11:23, 2.31s/it]
{'loss': '0.4693', 'grad_norm': '0.2275', 'learning_rate': '6.766e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6813', 'tokens/total': 31457280, 'tokens/trainable': 29262050, 'epoch': '0.916'} |
|
45%|βββββ | 240/536 [22:34<11:23, 2.31s/it]
45%|βββββ | 241/536 [22:37<11:19, 2.30s/it]
45%|βββββ | 242/536 [22:39<11:12, 2.29s/it]
45%|βββββ | 243/536 [22:41<11:10, 2.29s/it][2026-03-16 19:34:16,197] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-243 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.81s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.82s/it] |
|
46%|βββββ | 244/536 [24:21<2:33:31, 31.55s/it]
46%|βββββ | 245/536 [24:23<1:50:23, 22.76s/it]
{'loss': '0.4629', 'grad_norm': '0.2178', 'learning_rate': '6.613e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 32112640, 'tokens/trainable': 29868356, 'epoch': '0.9351'} |
|
46%|βββββ | 245/536 [24:23<1:50:23, 22.76s/it]
46%|βββββ | 246/536 [24:25<1:20:19, 16.62s/it]
46%|βββββ | 247/536 [24:28<59:24, 12.33s/it]
46%|βββββ | 248/536 [24:30<45:07, 9.40s/it]
46%|βββββ | 249/536 [24:33<34:46, 7.27s/it]
47%|βββββ | 250/536 [24:35<27:30, 5.77s/it]
{'loss': '0.474', 'grad_norm': '0.2539', 'learning_rate': '6.458e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6714', 'tokens/total': 32768000, 'tokens/trainable': 30473100, 'epoch': '0.9542'} |
|
47%|βββββ | 250/536 [24:35<27:30, 5.77s/it]
47%|βββββ | 251/536 [24:37<22:25, 4.72s/it]
47%|βββββ | 252/536 [24:39<18:50, 3.98s/it]
47%|βββββ | 253/536 [24:42<16:27, 3.49s/it]
47%|βββββ | 254/536 [24:44<14:46, 3.14s/it]
48%|βββββ | 255/536 [24:46<13:26, 2.87s/it]
{'loss': '0.467', 'grad_norm': '0.2305', 'learning_rate': '6.302e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6772', 'tokens/total': 33423360, 'tokens/trainable': 31078478, 'epoch': '0.9733'} |
|
48%|βββββ | 255/536 [24:46<13:26, 2.87s/it]
48%|βββββ | 256/536 [24:49<12:32, 2.69s/it]
48%|βββββ | 257/536 [24:51<11:59, 2.58s/it]
48%|βββββ | 258/536 [24:53<11:39, 2.52s/it]
48%|βββββ | 259/536 [24:56<11:19, 2.45s/it]
49%|βββββ | 260/536 [24:58<11:02, 2.40s/it]
{'loss': '0.4511', 'grad_norm': '0.2148', 'learning_rate': '6.144e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6619', 'tokens/total': 34078720, 'tokens/trainable': 31682612, 'epoch': '0.9924'} |
|
49%|βββββ | 260/536 [24:58<11:02, 2.40s/it]
49%|βββββ | 261/536 [25:00<10:50, 2.36s/it]
49%|βββββ | 262/536 [25:03<10:56, 2.40s/it]
49%|βββββ | 263/536 [25:06<12:20, 2.71s/it]
49%|βββββ | 264/536 [25:08<11:41, 2.58s/it]
49%|βββββ | 265/536 [25:11<11:22, 2.52s/it]
{'loss': '0.4682', 'grad_norm': '0.2451', 'learning_rate': '5.985e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6472', 'tokens/total': 34734080, 'tokens/trainable': 32293470, 'epoch': '1.011'} |
|
49%|βββββ | 265/536 [25:11<11:22, 2.52s/it]
50%|βββββ | 266/536 [25:13<11:14, 2.50s/it]
50%|βββββ | 267/536 [25:16<11:28, 2.56s/it]
50%|βββββ | 268/536 [25:18<11:16, 2.52s/it]
50%|βββββ | 269/536 [25:21<10:53, 2.45s/it]
50%|βββββ | 270/536 [25:23<10:39, 2.40s/it]
{'loss': '0.461', 'grad_norm': '0.2207', 'learning_rate': '5.826e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6685', 'tokens/total': 35389440, 'tokens/trainable': 32904464, 'epoch': '1.031'} |
|
50%|βββββ | 270/536 [25:23<10:39, 2.40s/it][2026-03-16 19:36:59,256] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-270 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.30s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.30s/it] |
|
51%|βββββ | 271/536 [27:02<2:18:44, 31.41s/it]
51%|βββββ | 272/536 [27:04<1:39:50, 22.69s/it]
51%|βββββ | 273/536 [27:07<1:12:35, 16.56s/it]
51%|βββββ | 274/536 [27:09<53:33, 12.26s/it]
51%|ββββββ | 275/536 [27:11<40:23, 9.29s/it]
{'loss': '0.4545', 'grad_norm': '0.2324', 'learning_rate': '5.665e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6568', 'tokens/total': 36044800, 'tokens/trainable': 33517832, 'epoch': '1.05'} |
|
51%|ββββββ | 275/536 [27:11<40:23, 9.29s/it]
51%|ββββββ | 276/536 [27:15<32:37, 7.53s/it]
52%|ββββββ | 277/536 [27:17<25:45, 5.97s/it]
52%|ββββββ | 278/536 [27:19<20:59, 4.88s/it]
52%|ββββββ | 279/536 [27:22<17:35, 4.11s/it]
52%|ββββββ | 280/536 [27:24<15:12, 3.56s/it]
{'loss': '0.447', 'grad_norm': '0.2158', 'learning_rate': '5.503e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6754', 'tokens/total': 36700160, 'tokens/trainable': 34129632, 'epoch': '1.069'} |
|
52%|ββββββ | 280/536 [27:24<15:12, 3.56s/it]
52%|ββββββ | 281/536 [27:26<13:35, 3.20s/it]
53%|ββββββ | 282/536 [27:28<12:20, 2.91s/it]
53%|ββββββ | 283/536 [27:31<11:30, 2.73s/it]
53%|ββββββ | 284/536 [27:33<10:54, 2.60s/it]
53%|ββββββ | 285/536 [27:36<10:51, 2.60s/it]
{'loss': '0.4378', 'grad_norm': '0.2119', 'learning_rate': '5.341e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5854', 'tokens/total': 37355520, 'tokens/trainable': 34742888, 'epoch': '1.088'} |
|
53%|ββββββ | 285/536 [27:36<10:51, 2.60s/it]
53%|ββββββ | 286/536 [27:38<10:42, 2.57s/it]
54%|ββββββ | 287/536 [27:41<10:26, 2.51s/it]
54%|ββββββ | 288/536 [27:43<10:02, 2.43s/it]
54%|ββββββ | 289/536 [27:45<09:45, 2.37s/it]
54%|ββββββ | 290/536 [27:47<09:40, 2.36s/it]
{'loss': '0.4756', 'grad_norm': '0.2246', 'learning_rate': '5.179e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6534', 'tokens/total': 38010880, 'tokens/trainable': 35352848, 'epoch': '1.107'} |
|
54%|ββββββ | 290/536 [27:47<09:40, 2.36s/it]
54%|ββββββ | 291/536 [27:50<09:33, 2.34s/it]
54%|ββββββ | 292/536 [27:52<09:27, 2.33s/it]
55%|ββββββ | 293/536 [27:54<09:23, 2.32s/it]
55%|ββββββ | 294/536 [27:57<09:23, 2.33s/it]
55%|ββββββ | 295/536 [27:59<09:21, 2.33s/it]
{'loss': '0.4635', 'grad_norm': '0.2188', 'learning_rate': '5.016e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6541', 'tokens/total': 38666240, 'tokens/trainable': 35964736, 'epoch': '1.126'} |
|
55%|ββββββ | 295/536 [27:59<09:21, 2.33s/it]
55%|ββββββ | 296/536 [28:01<09:16, 2.32s/it]
55%|ββββββ | 297/536 [28:03<09:09, 2.30s/it][2026-03-16 19:39:38,467] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-297 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.82s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.82s/it] |
|
56%|ββββββ | 298/536 [29:43<2:05:14, 31.57s/it]
56%|ββββββ | 299/536 [29:46<1:30:06, 22.81s/it]
56%|ββββββ | 300/536 [29:48<1:05:28, 16.65s/it]
{'loss': '0.4578', 'grad_norm': '0.2334', 'learning_rate': '4.854e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6804', 'tokens/total': 39321600, 'tokens/trainable': 36579308, 'epoch': '1.145'} |
|
56%|ββββββ | 300/536 [29:48<1:05:28, 16.65s/it]
56%|ββββββ | 301/536 [29:50<48:19, 12.34s/it]
56%|ββββββ | 302/536 [29:53<36:18, 9.31s/it]
57%|ββββββ | 303/536 [29:55<27:59, 7.21s/it]
57%|ββββββ | 304/536 [29:57<22:22, 5.78s/it]
57%|ββββββ | 305/536 [30:00<18:50, 4.89s/it]
{'loss': '0.4526', 'grad_norm': '0.2129', 'learning_rate': '4.691e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6214', 'tokens/total': 39976960, 'tokens/trainable': 37187912, 'epoch': '1.164'} |
|
57%|ββββββ | 305/536 [30:00<18:50, 4.89s/it]
57%|ββββββ | 306/536 [30:02<15:44, 4.11s/it]
57%|ββββββ | 307/536 [30:05<13:31, 3.55s/it]
57%|ββββββ | 308/536 [30:07<12:06, 3.19s/it]
58%|ββββββ | 309/536 [30:09<11:00, 2.91s/it]
58%|ββββββ | 310/536 [30:12<10:18, 2.74s/it]
{'loss': '0.4482', 'grad_norm': '0.21', 'learning_rate': '4.529e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6526', 'tokens/total': 40632320, 'tokens/trainable': 37799984, 'epoch': '1.183'} |
|
58%|ββββββ | 310/536 [30:12<10:18, 2.74s/it]
58%|ββββββ | 311/536 [30:14<09:49, 2.62s/it]
58%|ββββββ | 312/536 [30:16<09:25, 2.53s/it]
58%|ββββββ | 313/536 [30:18<09:07, 2.45s/it]
59%|ββββββ | 314/536 [30:21<09:33, 2.58s/it]
59%|ββββββ | 315/536 [30:24<09:07, 2.48s/it]
{'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '4.368e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6899', 'tokens/total': 41287680, 'tokens/trainable': 38409832, 'epoch': '1.202'} |
|
59%|ββββββ | 315/536 [30:24<09:07, 2.48s/it]
59%|ββββββ | 316/536 [30:26<08:55, 2.43s/it]
59%|ββββββ | 317/536 [30:28<08:44, 2.39s/it]
59%|ββββββ | 318/536 [30:31<08:35, 2.36s/it]
60%|ββββββ | 319/536 [30:33<08:25, 2.33s/it]
60%|ββββββ | 320/536 [30:35<08:19, 2.31s/it]
{'loss': '0.4539', 'grad_norm': '0.2285', 'learning_rate': '4.207e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6757', 'tokens/total': 41943040, 'tokens/trainable': 39020096, 'epoch': '1.221'} |
|
60%|ββββββ | 320/536 [30:35<08:19, 2.31s/it]
60%|ββββββ | 321/536 [30:37<08:17, 2.31s/it]
60%|ββββββ | 322/536 [30:40<08:09, 2.29s/it]
60%|ββββββ | 323/536 [30:42<08:19, 2.35s/it]
60%|ββββββ | 324/536 [30:45<08:26, 2.39s/it][2026-03-16 19:42:20,100] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-324 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.49s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.49s/it] |
|
61%|ββββββ | 325/536 [32:22<1:48:54, 30.97s/it]
{'loss': '0.4481', 'grad_norm': '0.2246', 'learning_rate': '4.046e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6887', 'tokens/total': 42598400, 'tokens/trainable': 39629160, 'epoch': '1.24'} |
|
61%|ββββββ | 325/536 [32:22<1:48:54, 30.97s/it]
61%|ββββββ | 326/536 [32:24<1:18:13, 22.35s/it]
61%|ββββββ | 327/536 [32:27<56:54, 16.34s/it]
61%|ββββββ | 328/536 [32:29<42:00, 12.12s/it]
61%|βββββββ | 329/536 [32:31<31:43, 9.20s/it]
62%|βββββββ | 330/536 [32:34<24:27, 7.12s/it]
{'loss': '0.4542', 'grad_norm': '0.2256', 'learning_rate': '3.887e-06', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6731', 'tokens/total': 43253760, 'tokens/trainable': 40237288, 'epoch': '1.26'} |
|
62%|βββββββ | 330/536 [32:34<24:27, 7.12s/it]
62%|βββββββ | 331/536 [32:36<19:21, 5.67s/it]
62%|βββββββ | 332/536 [32:39<16:57, 4.99s/it]
62%|βββββββ | 333/536 [32:42<14:06, 4.17s/it]
62%|βββββββ | 334/536 [32:44<12:07, 3.60s/it]
62%|βββββββ | 335/536 [32:46<10:43, 3.20s/it]
{'loss': '0.4412', 'grad_norm': '0.2539', 'learning_rate': '3.729e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6745', 'tokens/total': 43909120, 'tokens/trainable': 40848032, 'epoch': '1.279'} |
|
62%|βββββββ | 335/536 [32:46<10:43, 3.20s/it]
63%|βββββββ | 336/536 [32:48<09:46, 2.93s/it]
63%|βββββββ | 337/536 [32:51<09:05, 2.74s/it]
63%|βββββββ | 338/536 [32:53<08:37, 2.61s/it]
63%|βββββββ | 339/536 [32:55<08:14, 2.51s/it]
63%|βββββββ | 340/536 [32:58<08:04, 2.47s/it]
{'loss': '0.4615', 'grad_norm': '0.2217', 'learning_rate': '3.573e-06', 'ppl': '1.586', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6370', 'tokens/total': 44564480, 'tokens/trainable': 41457624, 'epoch': '1.298'} |
|
63%|βββββββ | 340/536 [32:58<08:04, 2.47s/it]
64%|βββββββ | 341/536 [33:00<08:05, 2.49s/it]
64%|βββββββ | 342/536 [33:03<07:51, 2.43s/it]
64%|βββββββ | 343/536 [33:05<07:49, 2.43s/it]
64%|βββββββ | 344/536 [33:08<07:51, 2.46s/it]
64%|βββββββ | 345/536 [33:10<07:38, 2.40s/it]
{'loss': '0.4599', 'grad_norm': '0.2188', 'learning_rate': '3.418e-06', 'ppl': '1.584', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6795', 'tokens/total': 45219840, 'tokens/trainable': 42069272, 'epoch': '1.317'} |
|
64%|βββββββ | 345/536 [33:10<07:38, 2.40s/it]
65%|βββββββ | 346/536 [33:12<07:29, 2.37s/it]
65%|βββββββ | 347/536 [33:14<07:26, 2.36s/it]
65%|βββββββ | 348/536 [33:17<07:20, 2.34s/it]
65%|βββββββ | 349/536 [33:19<07:12, 2.32s/it]
65%|βββββββ | 350/536 [33:21<07:09, 2.31s/it]
{'loss': '0.4499', 'grad_norm': '0.2148', 'learning_rate': '3.264e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6691', 'tokens/total': 45875200, 'tokens/trainable': 42681132, 'epoch': '1.336'} |
|
65%|βββββββ | 350/536 [33:21<07:09, 2.31s/it]
65%|βββββββ | 351/536 [33:24<07:04, 2.29s/it][2026-03-16 19:44:58,459] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-351 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.55s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.55s/it] |
|
66%|βββββββ | 352/536 [35:02<1:35:17, 31.07s/it]
66%|βββββββ | 353/536 [35:04<1:08:31, 22.47s/it]
66%|βββββββ | 354/536 [35:06<49:47, 16.42s/it]
66%|βββββββ | 355/536 [35:09<36:49, 12.21s/it]
{'loss': '0.4529', 'grad_norm': '0.249', 'learning_rate': '3.113e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6371', 'tokens/total': 46530560, 'tokens/trainable': 43292904, 'epoch': '1.355'} |
|
66%|βββββββ | 355/536 [35:09<36:49, 12.21s/it]
66%|βββββββ | 356/536 [35:11<27:37, 9.21s/it]
67%|βββββββ | 357/536 [35:13<21:14, 7.12s/it]
67%|βββββββ | 358/536 [35:16<16:48, 5.67s/it]
67%|βββββββ | 359/536 [35:18<13:56, 4.73s/it]
67%|βββββββ | 360/536 [35:21<11:51, 4.04s/it]
{'loss': '0.4461', 'grad_norm': '0.2207', 'learning_rate': '2.963e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6150', 'tokens/total': 47185920, 'tokens/trainable': 43900272, 'epoch': '1.374'} |
|
67%|βββββββ | 360/536 [35:21<11:51, 4.04s/it]
67%|βββββββ | 361/536 [35:23<10:14, 3.51s/it]
68%|βββββββ | 362/536 [35:25<09:24, 3.25s/it]
68%|βββββββ | 363/536 [35:28<08:33, 2.97s/it]
68%|βββββββ | 364/536 [35:30<07:52, 2.75s/it]
68%|βββββββ | 365/536 [35:32<07:23, 2.59s/it]
{'loss': '0.4581', 'grad_norm': '0.3555', 'learning_rate': '2.816e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6767', 'tokens/total': 47841280, 'tokens/trainable': 44509872, 'epoch': '1.393'} |
|
68%|βββββββ | 365/536 [35:32<07:23, 2.59s/it]
68%|βββββββ | 366/536 [35:34<07:04, 2.50s/it]
68%|βββββββ | 367/536 [35:37<06:49, 2.42s/it]
69%|βββββββ | 368/536 [35:39<06:39, 2.38s/it]
69%|βββββββ | 369/536 [35:41<06:33, 2.36s/it]
69%|βββββββ | 370/536 [35:44<06:33, 2.37s/it]
{'loss': '0.4483', 'grad_norm': '0.2109', 'learning_rate': '2.671e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6359', 'tokens/total': 48496640, 'tokens/trainable': 45121444, 'epoch': '1.412'} |
|
69%|βββββββ | 370/536 [35:44<06:33, 2.37s/it]
69%|βββββββ | 371/536 [35:46<06:27, 2.35s/it]
69%|βββββββ | 372/536 [35:48<06:25, 2.35s/it]
70%|βββββββ | 373/536 [35:51<06:17, 2.31s/it]
70%|βββββββ | 374/536 [35:53<06:12, 2.30s/it]
70%|βββββββ | 375/536 [35:55<06:13, 2.32s/it]
{'loss': '0.4475', 'grad_norm': '0.2617', 'learning_rate': '2.528e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6505', 'tokens/total': 49152000, 'tokens/trainable': 45733296, 'epoch': '1.431'} |
|
70%|βββββββ | 375/536 [35:55<06:13, 2.32s/it]
70%|βββββββ | 376/536 [35:58<06:09, 2.31s/it]
70%|βββββββ | 377/536 [36:00<06:26, 2.43s/it]
71%|βββββββ | 378/536 [36:03<06:17, 2.39s/it][2026-03-16 19:47:37,290] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-378 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.77s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.77s/it] |
|
71%|βββββββ | 379/536 [37:39<1:20:25, 30.73s/it]
71%|βββββββ | 380/536 [37:42<57:44, 22.21s/it]
{'loss': '0.4467', 'grad_norm': '0.208', 'learning_rate': '2.388e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6639', 'tokens/total': 49807360, 'tokens/trainable': 46341868, 'epoch': '1.45'} |
|
71%|βββββββ | 380/536 [37:42<57:44, 22.21s/it]
71%|βββββββ | 381/536 [37:44<41:54, 16.22s/it]
71%|ββββββββ | 382/536 [37:46<31:01, 12.09s/it]
71%|ββββββββ | 383/536 [37:49<23:20, 9.16s/it]
72%|ββββββββ | 384/536 [37:51<17:57, 7.09s/it]
72%|ββββββββ | 385/536 [37:53<14:15, 5.66s/it]
{'loss': '0.4373', 'grad_norm': '0.2129', 'learning_rate': '2.251e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6487', 'tokens/total': 50462720, 'tokens/trainable': 46948144, 'epoch': '1.469'} |
|
72%|ββββββββ | 385/536 [37:53<14:15, 5.66s/it]
72%|ββββββββ | 386/536 [37:56<11:39, 4.67s/it]
72%|ββββββββ | 387/536 [37:58<09:46, 3.94s/it]
72%|ββββββββ | 388/536 [38:00<08:31, 3.45s/it]
73%|ββββββββ | 389/536 [38:03<07:40, 3.14s/it]
73%|ββββββββ | 390/536 [38:05<07:00, 2.88s/it]
{'loss': '0.452', 'grad_norm': '0.2314', 'learning_rate': '2.117e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 51118080, 'tokens/trainable': 47558056, 'epoch': '1.489'} |
|
73%|ββββββββ | 390/536 [38:05<07:00, 2.88s/it]
73%|ββββββββ | 391/536 [38:08<07:22, 3.05s/it]
73%|ββββββββ | 392/536 [38:11<06:49, 2.85s/it]
73%|ββββββββ | 393/536 [38:13<06:22, 2.68s/it]
74%|ββββββββ | 394/536 [38:15<06:02, 2.55s/it]
74%|ββββββββ | 395/536 [38:18<06:01, 2.56s/it]
{'loss': '0.4435', 'grad_norm': '0.2139', 'learning_rate': '1.985e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5877', 'tokens/total': 51773440, 'tokens/trainable': 48168520, 'epoch': '1.508'} |
|
74%|ββββββββ | 395/536 [38:18<06:01, 2.56s/it]
74%|ββββββββ | 396/536 [38:20<05:47, 2.48s/it]
74%|ββββββββ | 397/536 [38:22<05:35, 2.42s/it]
74%|ββββββββ | 398/536 [38:25<05:27, 2.37s/it]
74%|ββββββββ | 399/536 [38:27<05:22, 2.35s/it]
75%|ββββββββ | 400/536 [38:29<05:21, 2.37s/it]
{'loss': '0.4444', 'grad_norm': '0.2236', 'learning_rate': '1.857e-06', 'ppl': '1.56', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6406', 'tokens/total': 52428800, 'tokens/trainable': 48779416, 'epoch': '1.527'} |
|
75%|ββββββββ | 400/536 [38:29<05:21, 2.37s/it]
75%|ββββββββ | 401/536 [38:32<05:15, 2.34s/it]
75%|ββββββββ | 402/536 [38:34<05:10, 2.32s/it]
75%|ββββββββ | 403/536 [38:36<05:10, 2.34s/it]
75%|ββββββββ | 404/536 [38:39<05:07, 2.33s/it]
76%|ββββββββ | 405/536 [38:41<05:03, 2.32s/it]
{'loss': '0.4557', 'grad_norm': '0.2324', 'learning_rate': '1.732e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6719', 'tokens/total': 53084160, 'tokens/trainable': 49387032, 'epoch': '1.546'} |
|
76%|ββββββββ | 405/536 [38:41<05:03, 2.32s/it][2026-03-16 19:50:15,755] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-405 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.46s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.46s/it] |
|
76%|ββββββββ | 406/536 [40:19<1:06:58, 30.91s/it]
76%|ββββββββ | 407/536 [40:21<48:00, 22.33s/it]
76%|ββββββββ | 408/536 [40:23<34:49, 16.33s/it]
76%|ββββββββ | 409/536 [40:25<25:39, 12.12s/it]
76%|ββββββββ | 410/536 [40:28<19:38, 9.36s/it]
{'loss': '0.4617', 'grad_norm': '0.2168', 'learning_rate': '1.611e-06', 'ppl': '1.587', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '5250', 'tokens/total': 53739520, 'tokens/trainable': 49995344, 'epoch': '1.565'} |
|
76%|ββββββββ | 410/536 [40:28<19:38, 9.36s/it]
77%|ββββββββ | 411/536 [40:31<15:03, 7.23s/it]
77%|ββββββββ | 412/536 [40:33<11:54, 5.76s/it]
77%|ββββββββ | 413/536 [40:36<09:53, 4.83s/it]
77%|ββββββββ | 414/536 [40:38<08:16, 4.07s/it]
77%|ββββββββ | 415/536 [40:40<07:07, 3.53s/it]
{'loss': '0.4492', 'grad_norm': '0.2217', 'learning_rate': '1.493e-06', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6671', 'tokens/total': 54394880, 'tokens/trainable': 50603264, 'epoch': '1.584'} |
|
77%|ββββββββ | 415/536 [40:40<07:07, 3.53s/it]
78%|ββββββββ | 416/536 [40:43<06:19, 3.17s/it]
78%|ββββββββ | 417/536 [40:45<05:45, 2.90s/it]
78%|ββββββββ | 418/536 [40:47<05:26, 2.77s/it]
78%|ββββββββ | 419/536 [40:50<05:18, 2.73s/it]
78%|ββββββββ | 420/536 [40:52<05:02, 2.61s/it]
{'loss': '0.4522', 'grad_norm': '0.2676', 'learning_rate': '1.379e-06', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6557', 'tokens/total': 55050240, 'tokens/trainable': 51213612, 'epoch': '1.603'} |
|
78%|ββββββββ | 420/536 [40:52<05:02, 2.61s/it]
79%|ββββββββ | 421/536 [40:55<04:49, 2.52s/it]
79%|ββββββββ | 422/536 [40:57<04:40, 2.46s/it]
79%|ββββββββ | 423/536 [40:59<04:40, 2.49s/it]
79%|ββββββββ | 424/536 [41:02<04:29, 2.41s/it]
79%|ββββββββ | 425/536 [41:04<04:21, 2.36s/it]
{'loss': '0.4414', 'grad_norm': '0.2168', 'learning_rate': '1.269e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6755', 'tokens/total': 55705600, 'tokens/trainable': 51819592, 'epoch': '1.622'} |
|
79%|ββββββββ | 425/536 [41:04<04:21, 2.36s/it]
79%|ββββββββ | 426/536 [41:06<04:17, 2.34s/it]
80%|ββββββββ | 427/536 [41:08<04:13, 2.32s/it]
80%|ββββββββ | 428/536 [41:11<04:08, 2.30s/it]
80%|ββββββββ | 429/536 [41:13<04:07, 2.31s/it]
80%|ββββββββ | 430/536 [41:15<04:04, 2.31s/it]
{'loss': '0.4532', 'grad_norm': '0.2217', 'learning_rate': '1.163e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6642', 'tokens/total': 56360960, 'tokens/trainable': 52431520, 'epoch': '1.641'} |
|
80%|ββββββββ | 430/536 [41:15<04:04, 2.31s/it]
80%|ββββββββ | 431/536 [41:18<04:10, 2.38s/it]
81%|ββββββββ | 432/536 [41:20<04:04, 2.35s/it][2026-03-16 19:52:55,057] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-432 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.45s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.45s/it] |
|
81%|ββββββββ | 433/536 [42:58<53:14, 31.01s/it]
81%|ββββββββ | 434/536 [43:00<38:05, 22.41s/it]
81%|ββββββββ | 435/536 [43:03<27:33, 16.37s/it]
{'loss': '0.4605', 'grad_norm': '0.3574', 'learning_rate': '1.061e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6726', 'tokens/total': 57016320, 'tokens/trainable': 53041944, 'epoch': '1.66'} |
|
81%|ββββββββ | 435/536 [43:03<27:33, 16.37s/it]
81%|βββββββββ | 436/536 [43:05<20:20, 12.20s/it]
82%|βββββββββ | 437/536 [43:07<15:11, 9.20s/it]
82%|βββββββββ | 438/536 [43:10<11:41, 7.16s/it]
82%|βββββββββ | 439/536 [43:12<09:11, 5.68s/it]
82%|βββββββββ | 440/536 [43:14<07:29, 4.68s/it]
{'loss': '0.446', 'grad_norm': '0.2119', 'learning_rate': '9.626e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6513', 'tokens/total': 57671680, 'tokens/trainable': 53647180, 'epoch': '1.679'} |
|
82%|βββββββββ | 440/536 [43:14<07:29, 4.68s/it]
82%|βββββββββ | 441/536 [43:17<06:23, 4.04s/it]
82%|βββββββββ | 442/536 [43:19<05:31, 3.53s/it]
83%|βββββββββ | 443/536 [43:21<04:53, 3.16s/it]
83%|βββββββββ | 444/536 [43:24<04:26, 2.90s/it]
83%|βββββββββ | 445/536 [43:26<04:04, 2.69s/it]
{'loss': '0.4299', 'grad_norm': '0.2188', 'learning_rate': '8.688e-07', 'ppl': '1.537', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6951', 'tokens/total': 58327040, 'tokens/trainable': 54256432, 'epoch': '1.698'} |
|
83%|βββββββββ | 445/536 [43:26<04:04, 2.69s/it]
83%|βββββββββ | 446/536 [43:28<03:52, 2.58s/it]
83%|βββββββββ | 447/536 [43:31<03:41, 2.49s/it]
84%|βββββββββ | 448/536 [43:33<03:32, 2.41s/it]
84%|βββββββββ | 449/536 [43:35<03:30, 2.42s/it]
84%|βββββββββ | 450/536 [43:38<03:23, 2.37s/it]
{'loss': '0.4583', 'grad_norm': '0.2188', 'learning_rate': '7.794e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6777', 'tokens/total': 58982400, 'tokens/trainable': 54863148, 'epoch': '1.718'} |
|
84%|βββββββββ | 450/536 [43:38<03:23, 2.37s/it]
84%|βββββββββ | 451/536 [43:40<03:28, 2.46s/it]
84%|βββββββββ | 452/536 [43:43<03:24, 2.44s/it]
85%|βββββββββ | 453/536 [43:45<03:19, 2.40s/it]
85%|βββββββββ | 454/536 [43:47<03:15, 2.38s/it]
85%|βββββββββ | 455/536 [43:50<03:12, 2.38s/it]
{'loss': '0.4523', 'grad_norm': '0.2119', 'learning_rate': '6.945e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6454', 'tokens/total': 59637760, 'tokens/trainable': 55471580, 'epoch': '1.737'} |
|
85%|βββββββββ | 455/536 [43:50<03:12, 2.38s/it]
85%|βββββββββ | 456/536 [43:52<03:15, 2.44s/it]
85%|βββββββββ | 457/536 [43:54<03:10, 2.41s/it]
85%|βββββββββ | 458/536 [43:57<03:11, 2.46s/it]
86%|βββββββββ | 459/536 [43:59<03:05, 2.41s/it][2026-03-16 19:55:34,637] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-459 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.63s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.63s/it] |
|
86%|βββββββββ | 460/536 [45:37<39:11, 30.94s/it]
{'loss': '0.4523', 'grad_norm': '0.2148', 'learning_rate': '6.141e-07', 'ppl': '1.572', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6711', 'tokens/total': 60293120, 'tokens/trainable': 56081832, 'epoch': '1.756'} |
|
86%|βββββββββ | 460/536 [45:37<39:11, 30.94s/it]
86%|βββββββββ | 461/536 [45:39<27:55, 22.34s/it]
86%|βββββββββ | 462/536 [45:41<20:06, 16.30s/it]
86%|βββββββββ | 463/536 [45:44<14:42, 12.09s/it]
87%|βββββββββ | 464/536 [45:46<10:58, 9.14s/it]
87%|βββββββββ | 465/536 [45:48<08:21, 7.06s/it]
{'loss': '0.4461', 'grad_norm': '0.4551', 'learning_rate': '5.383e-07', 'ppl': '1.562', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6903', 'tokens/total': 60948480, 'tokens/trainable': 56694240, 'epoch': '1.775'} |
|
87%|βββββββββ | 465/536 [45:48<08:21, 7.06s/it]
87%|βββββββββ | 466/536 [45:50<06:33, 5.62s/it]
87%|βββββββββ | 467/536 [45:53<05:23, 4.69s/it]
87%|βββββββββ | 468/536 [45:55<04:29, 3.97s/it]
88%|βββββββββ | 469/536 [45:57<03:50, 3.44s/it]
88%|βββββββββ | 470/536 [46:01<03:45, 3.42s/it]
{'loss': '0.4341', 'grad_norm': '0.208', 'learning_rate': '4.673e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4512', 'tokens/total': 61603840, 'tokens/trainable': 57301404, 'epoch': '1.794'} |
|
88%|βββββββββ | 470/536 [46:01<03:45, 3.42s/it]
88%|βββββββββ | 471/536 [46:03<03:20, 3.09s/it]
88%|βββββββββ | 472/536 [46:05<03:02, 2.86s/it]
88%|βββββββββ | 473/536 [46:08<02:50, 2.70s/it]
88%|βββββββββ | 474/536 [46:10<02:39, 2.58s/it]
89%|βββββββββ | 475/536 [46:12<02:34, 2.54s/it]
{'loss': '0.4627', 'grad_norm': '0.2461', 'learning_rate': '4.011e-07', 'ppl': '1.588', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6249', 'tokens/total': 62259200, 'tokens/trainable': 57910216, 'epoch': '1.813'} |
|
89%|βββββββββ | 475/536 [46:12<02:34, 2.54s/it]
89%|βββββββββ | 476/536 [46:15<02:26, 2.45s/it]
89%|βββββββββ | 477/536 [46:17<02:24, 2.45s/it]
89%|βββββββββ | 478/536 [46:20<02:25, 2.51s/it]
89%|βββββββββ | 479/536 [46:22<02:19, 2.45s/it]
90%|βββββββββ | 480/536 [46:24<02:13, 2.39s/it]
{'loss': '0.4538', 'grad_norm': '0.2178', 'learning_rate': '3.397e-07', 'ppl': '1.574', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6635', 'tokens/total': 62914560, 'tokens/trainable': 58517712, 'epoch': '1.832'} |
|
90%|βββββββββ | 480/536 [46:24<02:13, 2.39s/it]
90%|βββββββββ | 481/536 [46:27<02:09, 2.36s/it]
90%|βββββββββ | 482/536 [46:29<02:05, 2.33s/it]
90%|βββββββββ | 483/536 [46:31<02:03, 2.33s/it]
90%|βββββββββ | 484/536 [46:34<02:01, 2.33s/it]
90%|βββββββββ | 485/536 [46:36<02:04, 2.44s/it]
{'loss': '0.4395', 'grad_norm': '0.208', 'learning_rate': '2.833e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6504', 'tokens/total': 63569920, 'tokens/trainable': 59125608, 'epoch': '1.851'} |
|
90%|βββββββββ | 485/536 [46:36<02:04, 2.44s/it]
91%|βββββββββ | 486/536 [46:39<01:59, 2.39s/it][2026-03-16 19:58:13,418] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-486 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.82s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.82s/it] |
|
91%|βββββββββ | 487/536 [48:16<25:09, 30.81s/it]
91%|βββββββββ | 488/536 [48:18<17:48, 22.26s/it]
91%|βββββββββ | 489/536 [48:20<12:44, 16.26s/it]
91%|ββββββββββ| 490/536 [48:24<09:32, 12.45s/it]
{'loss': '0.4478', 'grad_norm': '0.2236', 'learning_rate': '2.318e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '4290', 'tokens/total': 64225280, 'tokens/trainable': 59733412, 'epoch': '1.87'} |
|
91%|ββββββββββ| 490/536 [48:24<09:32, 12.45s/it]
92%|ββββββββββ| 491/536 [48:26<07:02, 9.39s/it]
92%|ββββββββββ| 492/536 [48:28<05:19, 7.26s/it]
92%|ββββββββββ| 493/536 [48:31<04:08, 5.78s/it]
92%|ββββββββββ| 494/536 [48:33<03:22, 4.82s/it]
92%|ββββββββββ| 495/536 [48:36<02:47, 4.08s/it]
{'loss': '0.4362', 'grad_norm': '0.2129', 'learning_rate': '1.854e-07', 'ppl': '1.547', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6524', 'tokens/total': 64880640, 'tokens/trainable': 60339904, 'epoch': '1.889'} |
|
92%|ββββββββββ| 495/536 [48:36<02:47, 4.08s/it]
93%|ββββββββββ| 496/536 [48:38<02:25, 3.64s/it]
93%|ββββββββββ| 497/536 [48:40<02:06, 3.23s/it]
93%|ββββββββββ| 498/536 [48:43<01:52, 2.96s/it]
93%|ββββββββββ| 499/536 [48:45<01:42, 2.76s/it]
93%|ββββββββββ| 500/536 [48:47<01:34, 2.62s/it]
{'loss': '0.4656', 'grad_norm': '0.2217', 'learning_rate': '1.441e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6649', 'tokens/total': 65536000, 'tokens/trainable': 60945412, 'epoch': '1.908'} |
|
93%|ββββββββββ| 500/536 [48:47<01:34, 2.62s/it]
93%|ββββββββββ| 501/536 [48:50<01:29, 2.55s/it]
94%|ββββββββββ| 502/536 [48:52<01:25, 2.52s/it]
94%|ββββββββββ| 503/536 [48:54<01:20, 2.44s/it]
94%|ββββββββββ| 504/536 [48:57<01:16, 2.39s/it]
94%|ββββββββββ| 505/536 [48:59<01:13, 2.36s/it]
{'loss': '0.4466', 'grad_norm': '0.2129', 'learning_rate': '1.079e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6658', 'tokens/total': 66191360, 'tokens/trainable': 61550936, 'epoch': '1.927'} |
|
94%|ββββββββββ| 505/536 [48:59<01:13, 2.36s/it]
94%|ββββββββββ| 506/536 [49:01<01:09, 2.33s/it]
95%|ββββββββββ| 507/536 [49:04<01:07, 2.31s/it]
95%|ββββββββββ| 508/536 [49:06<01:04, 2.31s/it]
95%|ββββββββββ| 509/536 [49:08<01:04, 2.37s/it]
95%|ββββββββββ| 510/536 [49:11<01:01, 2.35s/it]
{'loss': '0.4754', 'grad_norm': '0.2168', 'learning_rate': '7.691e-08', 'ppl': '1.609', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6670', 'tokens/total': 66846720, 'tokens/trainable': 62157088, 'epoch': '1.947'} |
|
95%|ββββββββββ| 510/536 [49:11<01:01, 2.35s/it]
95%|ββββββββββ| 511/536 [49:13<00:58, 2.34s/it]
96%|ββββββββββ| 512/536 [49:15<00:55, 2.32s/it]
96%|ββββββββββ| 513/536 [49:18<00:53, 2.31s/it][2026-03-16 20:00:52,618] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-513 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.79s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.79s/it] |
|
96%|ββββββββββ| 514/536 [50:57<11:33, 31.50s/it]
96%|ββββββββββ| 515/536 [51:00<07:58, 22.77s/it]
{'loss': '0.4548', 'grad_norm': '0.2217', 'learning_rate': '5.11e-08', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6342', 'tokens/total': 67502080, 'tokens/trainable': 62762592, 'epoch': '1.966'} |
|
96%|ββββββββββ| 515/536 [51:00<07:58, 22.77s/it]
96%|ββββββββββ| 516/536 [51:02<05:32, 16.65s/it]
96%|ββββββββββ| 517/536 [51:04<03:54, 12.32s/it]
97%|ββββββββββ| 518/536 [51:06<02:47, 9.30s/it]
97%|ββββββββββ| 519/536 [51:09<02:02, 7.18s/it]
97%|ββββββββββ| 520/536 [51:11<01:32, 5.76s/it]
{'loss': '0.4544', 'grad_norm': '0.2314', 'learning_rate': '3.054e-08', 'ppl': '1.575', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6125', 'tokens/total': 68157440, 'tokens/trainable': 63366800, 'epoch': '1.985'} |
|
97%|ββββββββββ| 520/536 [51:11<01:32, 5.76s/it]
97%|ββββββββββ| 521/536 [51:13<01:10, 4.72s/it]
97%|ββββββββββ| 522/536 [51:16<00:55, 3.98s/it]
98%|ββββββββββ| 523/536 [51:18<00:45, 3.47s/it]
98%|ββββββββββ| 524/536 [51:20<00:38, 3.19s/it]
98%|ββββββββββ| 525/536 [51:24<00:35, 3.21s/it]
{'loss': '0.4504', 'grad_norm': '0.2246', 'learning_rate': '1.522e-08', 'ppl': '1.569', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6753', 'tokens/total': 68812800, 'tokens/trainable': 63975056, 'epoch': '2.004'} |
|
98%|ββββββββββ| 525/536 [51:24<00:35, 3.21s/it]
98%|ββββββββββ| 526/536 [51:26<00:29, 2.92s/it]
98%|ββββββββββ| 527/536 [51:28<00:24, 2.75s/it]
99%|ββββββββββ| 528/536 [51:31<00:21, 2.71s/it]
99%|ββββββββββ| 529/536 [51:33<00:18, 2.60s/it]
99%|ββββββββββ| 530/536 [51:36<00:15, 2.52s/it]
{'loss': '0.4546', 'grad_norm': '0.2109', 'learning_rate': '5.182e-09', 'ppl': '1.576', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6575', 'tokens/total': 69468160, 'tokens/trainable': 64586448, 'epoch': '2.023'} |
|
99%|ββββββββββ| 530/536 [51:36<00:15, 2.52s/it]
99%|ββββββββββ| 531/536 [51:38<00:12, 2.45s/it]
99%|ββββββββββ| 532/536 [51:40<00:09, 2.40s/it]
99%|ββββββββββ| 533/536 [51:43<00:07, 2.40s/it]
100%|ββββββββββ| 534/536 [51:45<00:04, 2.39s/it]
100%|ββββββββββ| 535/536 [51:47<00:02, 2.42s/it]
{'loss': '0.4493', 'grad_norm': '0.2314', 'learning_rate': '4.231e-10', 'ppl': '1.567', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'tokens/train_per_sec_per_gpu': '6115', 'tokens/total': 70123520, 'tokens/trainable': 65199364, 'epoch': '2.042'} |
|
100%|ββββββββββ| 535/536 [51:47<00:02, 2.42s/it]
100%|ββββββββββ| 536/536 [51:50<00:00, 2.37s/it][2026-03-16 20:03:25,941] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/checkpoint-536 |
|
|
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s][A |
|
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.68s/it][A
Writing model shards: 100%|ββββββββββ| 1/1 [00:16<00:00, 16.68s/it] |
|
{'train_runtime': '3210', 'train_samples_per_second': '2.672', 'train_steps_per_second': '0.167', 'train_loss': '0.4897', 'memory/max_active (GiB)': '22.63', 'memory/max_allocated (GiB)': '22.63', 'memory/device_reserved (GiB)': '29.23', 'epoch': '2.046', 'tokens/train_per_sec_per_gpu': '6757'} |
|
100%|ββββββββββ| 536/536 [53:26<00:00, 2.37s/it]
100%|ββββββββββ| 536/536 [53:26<00:00, 5.98s/it] |
| [2026-03-16 20:04:52,263] [INFO] [axolotl.train.save_trained_model:237] [PID:213] Training completed! Saving trained model to ./outputs/qwen3-sft-stmt-tk/. |
| [2026-03-16 20:05:01,009] [INFO] [axolotl.core.trainers.base._save:721] [PID:213] Saving model checkpoint to ./outputs/qwen3-sft-stmt-tk/ |
|
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.53s/it]
Writing model shards: 100%|ββββββββββ| 1/1 [00:17<00:00, 17.53s/it] |
| [2026-03-16 20:05:19,091] [INFO] [axolotl.train.save_trained_model:351] [PID:213] Model successfully saved to ./outputs/qwen3-sft-stmt-tk/ |
|
|