3b-qat-nvfp4 / debug.log
AlexHung29629's picture
Training in progress, epoch 0
d3fcddb verified
[2025-11-06 16:11:41,516] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:837642] baseline 0.000GB ()
[2025-11-06 16:11:41,516] [INFO] [axolotl.cli.config.load_cfg:248] [PID:837642] config:
{
"activation_offloading": false,
"axolotl_config_path": "3b-qat-nvfp4.yaml",
"base_model": "meta-llama/Llama-3.2-3B",
"base_model_config": "meta-llama/Llama-3.2-3B",
"batch_size": 64,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": false,
"n_gpu": 1,
"n_node": 1
},
"context_parallel_size": 1,
"cosine_constant_lr_ratio": 0.0,
"cosine_min_lr_ratio": 1.0,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_prepared_path": "./outputs/dataset_prepared",
"dataset_processes": 128,
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "yahma/alpaca-cleaned",
"split": "train[:95%]",
"trust_remote_code": false,
"type": "alpaca"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.8.0"
},
"eval_batch_size": 64,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_table_size": 0,
"evals_per_epoch": 1,
"experimental_skip_move_to_device": true,
"flash_attention": true,
"fp16": false,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": true
},
"hub_model_id": "AlexHung29629/3b-qat-nvfp4",
"include_tkps": true,
"is_llama_derived_model": true,
"learning_rate": 2e-05,
"liger_fused_linear_cross_entropy": true,
"liger_glu_activation": true,
"liger_layer_norm": true,
"liger_rms_norm": true,
"liger_rope": true,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_dropout": 0.0,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"micro_batch_size": 64,
"model_config_type": "llama",
"num_epochs": 1.0,
"optimizer": "adamw_torch_fused",
"output_dir": "./outputs/qat_out/",
"plugins": [
"axolotl.integrations.liger.LigerPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qat": {
"activation_dtype": "TorchAOQuantDType.nvfp4",
"group_size": 16,
"quantize_embedding": false,
"weight_dtype": "TorchAOQuantDType.nvfp4"
},
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_first_step": true,
"save_only_model": true,
"save_safetensors": true,
"saves_per_epoch": 1,
"sequence_len": 8192,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"special_tokens": {
"pad_token": "<|finetune_right_pad_id|>"
},
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "meta-llama/Llama-3.2-3B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.1,
"weight_decay": 0.0,
"world_size": 1
}
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
[2025-11-06 16:11:46,489] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:837642] Unable to find prepared dataset in outputs/dataset_prepared/9bc662aed65b76546b2d635b3957a343
[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:837642] Loading raw datasets...
[2025-11-06 16:11:46,492] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:837642] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Generating train split: 0%| | 0/51760 [00:00<?, ? examples/s] Generating train split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 51760/51760 [00:00<00:00, 229845.07 examples/s] Generating train split: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 51760/51760 [00:00<00:00, 228708.96 examples/s]
[2025-11-06 16:12:08,982] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:837642] Loading dataset: yahma/alpaca-cleaned with base_type: alpaca and prompt_style: None
Tokenizing Prompts (num_proc=128): 0%| | 0/49172 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=128): 0%| | 84/49172 [00:01<13:38, 59.94 examples/s] Tokenizing Prompts (num_proc=128): 0%| | 176/49172 [00:01<06:01, 135.38 examples/s] Tokenizing Prompts (num_proc=128): 2%|▏ | 792/49172 [00:01<01:05, 743.93 examples/s] Tokenizing Prompts (num_proc=128): 3%|β–Ž | 1289/49172 [00:01<00:39, 1224.84 examples/s] Tokenizing Prompts (num_proc=128): 3%|β–Ž | 1663/49172 [00:01<00:30, 1550.01 examples/s] Tokenizing Prompts (num_proc=128): 4%|▍ | 2053/49172 [00:02<00:25, 1838.37 examples/s] Tokenizing Prompts (num_proc=128): 5%|β–Œ | 2462/49172 [00:02<00:22, 2120.82 examples/s] Tokenizing Prompts (num_proc=128): 6%|β–Œ | 2931/49172 [00:02<00:17, 2593.78 examples/s] Tokenizing Prompts (num_proc=128): 7%|β–‹ | 3316/49172 [00:02<00:17, 2658.78 examples/s] Tokenizing Prompts (num_proc=128): 8%|β–Š | 3704/49172 [00:02<00:16, 2738.71 examples/s] Tokenizing Prompts (num_proc=128): 8%|β–Š | 4072/49172 [00:02<00:16, 2765.32 examples/s] Tokenizing Prompts (num_proc=128): 9%|β–‰ | 4392/49172 [00:02<00:16, 2672.66 examples/s] Tokenizing Prompts (num_proc=128): 10%|β–‰ | 4715/49172 [00:03<00:17, 2538.79 examples/s] Tokenizing Prompts (num_proc=128): 10%|β–ˆ | 5094/49172 [00:03<00:17, 2586.15 examples/s] Tokenizing Prompts (num_proc=128): 11%|β–ˆ | 5504/49172 [00:03<00:16, 2692.82 examples/s] Tokenizing Prompts (num_proc=128): 12%|β–ˆβ– | 5910/49172 [00:03<00:15, 2763.20 examples/s] Tokenizing Prompts (num_proc=128): 13%|β–ˆβ–Ž | 6311/49172 [00:03<00:14, 2975.07 examples/s] Tokenizing Prompts (num_proc=128): 14%|β–ˆβ–Ž | 6668/49172 [00:03<00:14, 2859.49 examples/s] Tokenizing Prompts (num_proc=128): 14%|β–ˆβ– | 7058/49172 [00:03<00:14, 2844.49 examples/s] Tokenizing Prompts (num_proc=128): 15%|β–ˆβ–Œ | 7385/49172 [00:03<00:15, 2728.62 examples/s] Tokenizing Prompts (num_proc=128): 16%|β–ˆβ–Œ | 7836/49172 [00:04<00:13, 3015.74 examples/s] Tokenizing Prompts (num_proc=128): 17%|β–ˆβ–‹ | 8193/49172 [00:04<00:14, 2873.13 examples/s] Tokenizing Prompts (num_proc=128): 17%|β–ˆβ–‹ | 8537/49172 [00:04<00:14, 2768.24 examples/s] Tokenizing Prompts (num_proc=128): 18%|β–ˆβ–Š | 8959/49172 [00:04<00:13, 2885.47 examples/s] Tokenizing Prompts (num_proc=128): 19%|β–ˆβ–‰ | 9339/49172 [00:04<00:14, 2805.07 examples/s] Tokenizing Prompts (num_proc=128): 20%|β–ˆβ–‰ | 9755/49172 [00:04<00:13, 2846.53 examples/s] Tokenizing Prompts (num_proc=128): 21%|β–ˆβ–ˆ | 10145/49172 [00:04<00:13, 2839.00 examples/s] Tokenizing Prompts (num_proc=128): 22%|β–ˆβ–ˆβ– | 10628/49172 [00:05<00:12, 3040.97 examples/s] Tokenizing Prompts (num_proc=128): 22%|β–ˆβ–ˆβ– | 11039/49172 [00:05<00:11, 3253.40 examples/s] Tokenizing Prompts (num_proc=128): 23%|β–ˆβ–ˆβ–Ž | 11385/49172 [00:05<00:12, 2982.44 examples/s] Tokenizing Prompts (num_proc=128): 24%|β–ˆβ–ˆβ– | 11692/49172 [00:05<00:13, 2756.63 examples/s] Tokenizing Prompts (num_proc=128): 25%|β–ˆβ–ˆβ– | 12063/49172 [00:05<00:13, 2720.60 examples/s] Tokenizing Prompts (num_proc=128): 25%|β–ˆβ–ˆβ–Œ | 12463/49172 [00:05<00:13, 2810.95 examples/s] Tokenizing Prompts (num_proc=128): 26%|β–ˆβ–ˆβ–Œ | 12850/49172 [00:05<00:12, 2819.15 examples/s] Tokenizing Prompts (num_proc=128): 27%|β–ˆβ–ˆβ–‹ | 13292/49172 [00:06<00:12, 2922.32 examples/s] Tokenizing Prompts (num_proc=128): 28%|β–ˆβ–ˆβ–Š | 13600/49172 [00:06<00:13, 2700.65 examples/s] Tokenizing Prompts (num_proc=128): 28%|β–ˆβ–ˆβ–Š | 13989/49172 [00:06<00:12, 2716.86 examples/s] Tokenizing Prompts (num_proc=128): 29%|β–ˆβ–ˆβ–‰ | 14475/49172 [00:06<00:12, 2852.98 examples/s] Tokenizing Prompts (num_proc=128): 30%|β–ˆβ–ˆβ–ˆ | 14914/49172 [00:06<00:11, 2938.70 examples/s] Tokenizing Prompts (num_proc=128): 31%|β–ˆβ–ˆβ–ˆβ– | 15412/49172 [00:06<00:10, 3223.11 examples/s] Tokenizing Prompts (num_proc=128): 32%|β–ˆβ–ˆβ–ˆβ– | 15748/49172 [00:06<00:11, 3025.74 examples/s] Tokenizing Prompts (num_proc=128): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 16102/49172 [00:06<00:11, 2858.09 examples/s] Tokenizing Prompts (num_proc=128): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 16438/49172 [00:07<00:12, 2720.40 examples/s] Tokenizing Prompts (num_proc=128): 34%|β–ˆβ–ˆβ–ˆβ– | 16767/49172 [00:07<00:12, 2595.58 examples/s] Tokenizing Prompts (num_proc=128): 35%|β–ˆβ–ˆβ–ˆβ– | 17149/49172 [00:07<00:12, 2665.96 examples/s] Tokenizing Prompts (num_proc=128): 36%|β–ˆβ–ˆβ–ˆβ–Œ | 17516/49172 [00:07<00:11, 2643.92 examples/s] Tokenizing Prompts (num_proc=128): 37%|β–ˆβ–ˆβ–ˆβ–‹ | 17961/49172 [00:07<00:10, 2849.24 examples/s] Tokenizing Prompts (num_proc=128): 37%|β–ˆβ–ˆβ–ˆβ–‹ | 18279/49172 [00:07<00:10, 2822.57 examples/s] Tokenizing Prompts (num_proc=128): 38%|β–ˆβ–ˆβ–ˆβ–Š | 18574/49172 [00:07<00:11, 2591.21 examples/s] Tokenizing Prompts (num_proc=128): 39%|β–ˆβ–ˆβ–ˆβ–Š | 18957/49172 [00:08<00:11, 2625.56 examples/s] Tokenizing Prompts (num_proc=128): 40%|β–ˆβ–ˆβ–ˆβ–‰ | 19438/49172 [00:08<00:10, 2894.61 examples/s] Tokenizing Prompts (num_proc=128): 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 19776/49172 [00:08<00:10, 2741.27 examples/s] Tokenizing Prompts (num_proc=128): 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 20133/49172 [00:08<00:10, 2676.71 examples/s] Tokenizing Prompts (num_proc=128): 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20582/49172 [00:08<00:10, 2813.67 examples/s] Tokenizing Prompts (num_proc=128): 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 21056/49172 [00:08<00:09, 2991.88 examples/s] Tokenizing Prompts (num_proc=128): 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 21507/49172 [00:08<00:08, 3120.60 examples/s] Tokenizing Prompts (num_proc=128): 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21844/49172 [00:09<00:09, 2927.37 examples/s] Tokenizing Prompts (num_proc=128): 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 22214/49172 [00:09<00:09, 2980.95 examples/s] Tokenizing Prompts (num_proc=128): 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 22524/49172 [00:09<00:09, 2740.32 examples/s] Tokenizing Prompts (num_proc=128): 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 22828/49172 [00:09<00:09, 2639.81 examples/s] Tokenizing Prompts (num_proc=128): 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 23146/49172 [00:09<00:10, 2510.35 examples/s] Tokenizing Prompts (num_proc=128): 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 23619/49172 [00:09<00:09, 2763.04 examples/s] Tokenizing Prompts (num_proc=128): 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 24069/49172 [00:09<00:08, 2964.79 examples/s] Tokenizing Prompts (num_proc=128): 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 24455/49172 [00:09<00:08, 2924.79 examples/s] Tokenizing Prompts (num_proc=128): 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 24809/49172 [00:10<00:08, 2856.24 examples/s] Tokenizing Prompts (num_proc=128): 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 25112/49172 [00:10<00:09, 2621.31 examples/s] Tokenizing Prompts (num_proc=128): 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 25522/49172 [00:10<00:08, 2714.48 examples/s] Tokenizing Prompts (num_proc=128): 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 25873/49172 [00:10<00:08, 2705.01 examples/s] Tokenizing Prompts (num_proc=128): 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 26322/49172 [00:10<00:07, 2907.63 examples/s] Tokenizing Prompts (num_proc=128): 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 26671/49172 [00:10<00:08, 2787.83 examples/s] Tokenizing Prompts (num_proc=128): 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 27127/49172 [00:10<00:07, 2923.69 examples/s] Tokenizing Prompts (num_proc=128): 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 27506/49172 [00:11<00:07, 2929.98 examples/s] Tokenizing Prompts (num_proc=128): 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 27928/49172 [00:11<00:07, 2972.71 examples/s] Tokenizing Prompts (num_proc=128): 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 28285/49172 [00:11<00:06, 3017.41 examples/s] Tokenizing Prompts (num_proc=128): 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 28589/49172 [00:11<00:07, 2769.13 examples/s] Tokenizing Prompts (num_proc=128): 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 28960/49172 [00:11<00:07, 2746.25 examples/s] Tokenizing Prompts (num_proc=128): 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 29343/49172 [00:11<00:07, 2735.06 examples/s] Tokenizing Prompts (num_proc=128): 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 29730/49172 [00:11<00:07, 2777.41 examples/s] Tokenizing Prompts (num_proc=128): 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 30201/49172 [00:11<00:06, 3027.86 examples/s] Tokenizing Prompts (num_proc=128): 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 30511/49172 [00:12<00:06, 2794.02 examples/s] Tokenizing Prompts (num_proc=128): 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 30963/49172 [00:12<00:06, 2860.65 examples/s] Tokenizing Prompts (num_proc=128): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 31260/49172 [00:12<00:06, 2659.00 examples/s] Tokenizing Prompts (num_proc=128): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31661/49172 [00:12<00:06, 2744.06 examples/s] Tokenizing Prompts (num_proc=128): 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 32103/49172 [00:12<00:05, 2886.04 examples/s] Tokenizing Prompts (num_proc=128): 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 32523/49172 [00:12<00:05, 2982.93 examples/s] Tokenizing Prompts (num_proc=128): 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 32894/49172 [00:12<00:05, 2895.08 examples/s] Tokenizing Prompts (num_proc=128): 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 33277/49172 [00:13<00:05, 3037.84 examples/s] Tokenizing Prompts (num_proc=128): 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 33649/49172 [00:13<00:05, 2842.83 examples/s] Tokenizing Prompts (num_proc=128): 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 33971/49172 [00:13<00:05, 2690.27 examples/s] Tokenizing Prompts (num_proc=128): 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34337/49172 [00:13<00:05, 2695.46 examples/s] Tokenizing Prompts (num_proc=128): 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 34730/49172 [00:13<00:05, 2719.08 examples/s] Tokenizing Prompts (num_proc=128): 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 35193/49172 [00:13<00:04, 2910.87 examples/s] Tokenizing Prompts (num_proc=128): 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 35583/49172 [00:13<00:04, 2876.12 examples/s] Tokenizing Prompts (num_proc=128): 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 35965/49172 [00:14<00:04, 2847.46 examples/s] Tokenizing Prompts (num_proc=128): 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 36341/49172 [00:14<00:04, 2829.40 examples/s] Tokenizing Prompts (num_proc=128): 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 36647/49172 [00:14<00:04, 2673.80 examples/s] Tokenizing Prompts (num_proc=128): 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 37103/49172 [00:14<00:04, 2839.03 examples/s] Tokenizing Prompts (num_proc=128): 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 37568/49172 [00:14<00:03, 3009.77 examples/s] Tokenizing Prompts (num_proc=128): 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 37894/49172 [00:14<00:03, 2833.19 examples/s] Tokenizing Prompts (num_proc=128): 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 38273/49172 [00:14<00:03, 2797.05 examples/s] Tokenizing Prompts (num_proc=128): 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 38625/49172 [00:15<00:03, 2713.00 examples/s] Tokenizing Prompts (num_proc=128): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 39134/49172 [00:15<00:03, 2989.24 examples/s] Tokenizing Prompts (num_proc=128): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 39484/49172 [00:15<00:03, 2970.93 examples/s] Tokenizing Prompts (num_proc=128): 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 39830/49172 [00:15<00:03, 2822.53 examples/s] Tokenizing Prompts (num_proc=128): 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 40145/49172 [00:15<00:03, 2691.61 examples/s] Tokenizing Prompts (num_proc=128): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 40611/49172 [00:15<00:02, 2919.81 examples/s] Tokenizing Prompts (num_proc=128): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 40933/49172 [00:15<00:02, 2872.22 examples/s] Tokenizing Prompts (num_proc=128): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 41235/49172 [00:15<00:03, 2550.35 examples/s] Tokenizing Prompts (num_proc=128): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 41636/49172 [00:16<00:02, 2623.37 examples/s] Tokenizing Prompts (num_proc=128): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 42011/49172 [00:16<00:02, 2667.31 examples/s] Tokenizing Prompts (num_proc=128): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 42565/49172 [00:16<00:02, 3151.00 examples/s] Tokenizing Prompts (num_proc=128): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 42957/49172 [00:16<00:02, 3026.85 examples/s] Tokenizing Prompts (num_proc=128): 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 43325/49172 [00:16<00:02, 2828.92 examples/s] Tokenizing Prompts (num_proc=128): 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 43700/49172 [00:16<00:01, 2829.86 examples/s] Tokenizing Prompts (num_proc=128): 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 44022/49172 [00:16<00:01, 2649.38 examples/s] Tokenizing Prompts (num_proc=128): 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 44422/49172 [00:17<00:01, 2713.96 examples/s] Tokenizing Prompts (num_proc=128): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 44782/49172 [00:17<00:01, 2703.83 examples/s] Tokenizing Prompts (num_proc=128): 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 45155/49172 [00:17<00:01, 2707.69 examples/s] Tokenizing Prompts (num_proc=128): 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 45628/49172 [00:17<00:01, 2900.02 examples/s] Tokenizing Prompts (num_proc=128): 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 45986/49172 [00:17<00:01, 2826.26 examples/s] Tokenizing Prompts (num_proc=128): 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 46333/49172 [00:17<00:01, 2738.12 examples/s] Tokenizing Prompts (num_proc=128): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 46787/49172 [00:17<00:00, 3126.02 examples/s] Tokenizing Prompts (num_proc=128): 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 47266/49172 [00:17<00:00, 3503.55 examples/s] Tokenizing Prompts (num_proc=128): 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 47643/49172 [00:18<00:00, 3338.62 examples/s] Tokenizing Prompts (num_proc=128): 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 48031/49172 [00:18<00:00, 3013.54 examples/s] Tokenizing Prompts (num_proc=128): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 48485/49172 [00:18<00:00, 3381.58 examples/s] Tokenizing Prompts (num_proc=128): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 48913/49172 [00:18<00:00, 3410.78 examples/s] Tokenizing Prompts (num_proc=128): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 49172/49172 [00:19<00:00, 2564.69 examples/s]
[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:837642] min_input_len: 33
[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:837642] max_input_len: 1051
Dropping Long Sequences (>8192) (num_proc=128): 0%| | 0/49172 [00:00<?, ? examples/s] Dropping Long Sequences (>8192) (num_proc=128): 1%| | 385/49172 [00:00<01:43, 471.65 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 5%|β–Œ | 2695/49172 [00:00<00:12, 3750.47 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 13%|β–ˆβ–Ž | 6545/49172 [00:01<00:04, 9405.16 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 19%|β–ˆβ–‰ | 9236/49172 [00:01<00:03, 12478.76 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 25%|β–ˆβ–ˆβ–Œ | 12308/49172 [00:01<00:02, 15931.93 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 30%|β–ˆβ–ˆβ–ˆ | 14996/49172 [00:01<00:01, 17930.73 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 37%|β–ˆβ–ˆβ–ˆβ–‹ | 18068/49172 [00:01<00:01, 20677.88 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21524/49172 [00:01<00:01, 23483.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 24596/49172 [00:01<00:01, 23335.07 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 29588/49172 [00:01<00:00, 29166.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 33812/49172 [00:01<00:00, 31889.27 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 37268/49172 [00:02<00:00, 32125.51 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 40724/49172 [00:02<00:00, 31182.70 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 45716/49172 [00:02<00:00, 33653.89 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 49172/49172 [00:02<00:00, 18442.91 examples/s]
Saving the dataset (0/128 shards): 0%| | 0/49172 [00:00<?, ? examples/s] Saving the dataset (0/128 shards): 1%| | 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (1/128 shards): 1%| | 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (2/128 shards): 2%|▏ | 770/49172 [00:02<04:25, 182.62 examples/s] Saving the dataset (3/128 shards): 2%|▏ | 1155/49172 [00:02<04:22, 182.62 examples/s] Saving the dataset (4/128 shards): 3%|β–Ž | 1540/49172 [00:02<04:20, 182.62 examples/s] Saving the dataset (5/128 shards): 4%|▍ | 1925/49172 [00:02<04:18, 182.62 examples/s] Saving the dataset (6/128 shards): 5%|▍ | 2310/49172 [00:02<04:16, 182.62 examples/s] Saving the dataset (7/128 shards): 5%|β–Œ | 2695/49172 [00:02<04:14, 182.62 examples/s] Saving the dataset (8/128 shards): 6%|β–‹ | 3080/49172 [00:02<04:12, 182.62 examples/s] Saving the dataset (9/128 shards): 7%|β–‹ | 3465/49172 [00:02<04:10, 182.62 examples/s] Saving the dataset (10/128 shards): 8%|β–Š | 3850/49172 [00:02<04:08, 182.62 examples/s] Saving the dataset (11/128 shards): 9%|β–Š | 4235/49172 [00:02<04:06, 182.62 examples/s] Saving the dataset (12/128 shards): 9%|β–‰ | 4620/49172 [00:02<04:03, 182.62 examples/s] Saving the dataset (13/128 shards): 10%|β–ˆ | 5005/49172 [00:02<04:01, 182.62 examples/s] Saving the dataset (14/128 shards): 11%|β–ˆ | 5390/49172 [00:02<03:59, 182.62 examples/s] Saving the dataset (15/128 shards): 12%|β–ˆβ– | 5775/49172 [00:02<03:57, 182.62 examples/s] Saving the dataset (16/128 shards): 13%|β–ˆβ–Ž | 6160/49172 [00:02<03:55, 182.62 examples/s] Saving the dataset (17/128 shards): 13%|β–ˆβ–Ž | 6545/49172 [00:02<03:53, 182.62 examples/s] Saving the dataset (18/128 shards): 14%|β–ˆβ– | 6930/49172 [00:02<03:51, 182.62 examples/s] Saving the dataset (19/128 shards): 15%|β–ˆβ– | 7315/49172 [00:02<03:49, 182.62 examples/s] Saving the dataset (20/128 shards): 16%|β–ˆβ–Œ | 7700/49172 [00:02<03:47, 182.62 examples/s] Saving the dataset (21/128 shards): 16%|β–ˆβ–‹ | 8084/49172 [00:02<03:44, 182.62 examples/s] Saving the dataset (22/128 shards): 17%|β–ˆβ–‹ | 8468/49172 [00:02<03:42, 182.62 examples/s] Saving the dataset (23/128 shards): 18%|β–ˆβ–Š | 8852/49172 [00:02<03:40, 182.62 examples/s] Saving the dataset (24/128 shards): 19%|β–ˆβ–‰ | 9236/49172 [00:02<03:38, 182.62 examples/s] Saving the dataset (25/128 shards): 20%|β–ˆβ–‰ | 9620/49172 [00:02<03:36, 182.62 examples/s] Saving the dataset (26/128 shards): 20%|β–ˆβ–ˆ | 10004/49172 [00:02<03:34, 182.62 examples/s] Saving the dataset (27/128 shards): 21%|β–ˆβ–ˆ | 10388/49172 [00:02<03:32, 182.62 examples/s] Saving the dataset (28/128 shards): 22%|β–ˆβ–ˆβ– | 10772/49172 [00:02<03:30, 182.62 examples/s] Saving the dataset (29/128 shards): 23%|β–ˆβ–ˆβ–Ž | 11156/49172 [00:02<03:28, 182.62 examples/s] Saving the dataset (30/128 shards): 23%|β–ˆβ–ˆβ–Ž | 11540/49172 [00:02<03:26, 182.62 examples/s] Saving the dataset (31/128 shards): 24%|β–ˆβ–ˆβ– | 11924/49172 [00:02<03:23, 182.62 examples/s] Saving the dataset (32/128 shards): 25%|β–ˆβ–ˆβ–Œ | 12308/49172 [00:02<03:21, 182.62 examples/s] Saving the dataset (33/128 shards): 26%|β–ˆβ–ˆβ–Œ | 12692/49172 [00:02<03:19, 182.62 examples/s] Saving the dataset (34/128 shards): 27%|β–ˆβ–ˆβ–‹ | 13076/49172 [00:02<03:17, 182.62 examples/s] Saving the dataset (35/128 shards): 27%|β–ˆβ–ˆβ–‹ | 13460/49172 [00:02<03:15, 182.62 examples/s] Saving the dataset (36/128 shards): 28%|β–ˆβ–ˆβ–Š | 13844/49172 [00:02<03:13, 182.62 examples/s] Saving the dataset (37/128 shards): 29%|β–ˆβ–ˆβ–‰ | 14228/49172 [00:02<03:11, 182.62 examples/s] Saving the dataset (38/128 shards): 30%|β–ˆβ–ˆβ–‰ | 14612/49172 [00:02<03:09, 182.62 examples/s] Saving the dataset (39/128 shards): 30%|β–ˆβ–ˆβ–ˆ | 14996/49172 [00:02<03:07, 182.62 examples/s] Saving the dataset (40/128 shards): 31%|β–ˆβ–ˆβ–ˆβ– | 15380/49172 [00:02<03:05, 182.62 examples/s] Saving the dataset (41/128 shards): 32%|β–ˆβ–ˆβ–ˆβ– | 15764/49172 [00:02<03:02, 182.62 examples/s] Saving the dataset (42/128 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 16148/49172 [00:02<03:00, 182.62 examples/s] Saving the dataset (43/128 shards): 34%|β–ˆβ–ˆβ–ˆβ–Ž | 16532/49172 [00:02<02:58, 182.62 examples/s] Saving the dataset (44/128 shards): 34%|β–ˆβ–ˆβ–ˆβ– | 16916/49172 [00:02<02:56, 182.62 examples/s] Saving the dataset (45/128 shards): 35%|β–ˆβ–ˆβ–ˆβ–Œ | 17300/49172 [00:02<02:54, 182.62 examples/s] Saving the dataset (46/128 shards): 36%|β–ˆβ–ˆβ–ˆβ–Œ | 17684/49172 [00:02<02:52, 182.62 examples/s] Saving the dataset (47/128 shards): 37%|β–ˆβ–ˆβ–ˆβ–‹ | 18068/49172 [00:02<02:50, 182.62 examples/s] Saving the dataset (48/128 shards): 38%|β–ˆβ–ˆβ–ˆβ–Š | 18452/49172 [00:02<02:48, 182.62 examples/s] Saving the dataset (49/128 shards): 38%|β–ˆβ–ˆβ–ˆβ–Š | 18836/49172 [00:02<02:46, 182.62 examples/s] Saving the dataset (50/128 shards): 39%|β–ˆβ–ˆβ–ˆβ–‰ | 19220/49172 [00:02<02:44, 182.62 examples/s] Saving the dataset (51/128 shards): 40%|β–ˆβ–ˆβ–ˆβ–‰ | 19604/49172 [00:02<02:41, 182.62 examples/s] Saving the dataset (52/128 shards): 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 19988/49172 [00:02<02:39, 182.62 examples/s] Saving the dataset (53/128 shards): 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20372/49172 [00:02<02:37, 182.62 examples/s] Saving the dataset (54/128 shards): 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20756/49172 [00:02<02:35, 182.62 examples/s] Saving the dataset (55/128 shards): 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 21140/49172 [00:02<02:33, 182.62 examples/s] Saving the dataset (56/128 shards): 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21524/49172 [00:02<02:31, 182.62 examples/s] Saving the dataset (57/128 shards): 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21908/49172 [00:02<02:29, 182.62 examples/s] Saving the dataset (58/128 shards): 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 22292/49172 [00:02<02:27, 182.62 examples/s] Saving the dataset (59/128 shards): 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 22676/49172 [00:02<02:25, 182.62 examples/s] Saving the dataset (60/128 shards): 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 23060/49172 [00:02<02:22, 182.62 examples/s] Saving the dataset (61/128 shards): 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 23444/49172 [00:02<02:20, 182.62 examples/s] Saving the dataset (62/128 shards): 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 23828/49172 [00:02<02:18, 182.62 examples/s] Saving the dataset (63/128 shards): 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 24212/49172 [00:02<02:16, 182.62 examples/s] Saving the dataset (64/128 shards): 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 24596/49172 [00:02<02:14, 182.62 examples/s] Saving the dataset (65/128 shards): 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 24980/49172 [00:02<02:12, 182.62 examples/s] Saving the dataset (66/128 shards): 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (67/128 shards): 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (68/128 shards): 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 26132/49172 [00:02<02:06, 182.62 examples/s] Saving the dataset (69/128 shards): 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 26516/49172 [00:02<02:04, 182.62 examples/s] Saving the dataset (70/128 shards): 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 26900/49172 [00:02<02:01, 182.62 examples/s] Saving the dataset (71/128 shards): 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 27284/49172 [00:02<01:59, 182.62 examples/s] Saving the dataset (72/128 shards): 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 27668/49172 [00:02<01:57, 182.62 examples/s] Saving the dataset (73/128 shards): 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 28052/49172 [00:02<01:55, 182.62 examples/s] Saving the dataset (74/128 shards): 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 28436/49172 [00:02<01:53, 182.62 examples/s] Saving the dataset (75/128 shards): 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 28820/49172 [00:02<01:51, 182.62 examples/s] Saving the dataset (76/128 shards): 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 29204/49172 [00:02<01:49, 182.62 examples/s] Saving the dataset (77/128 shards): 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 29588/49172 [00:02<01:47, 182.62 examples/s] Saving the dataset (78/128 shards): 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 29972/49172 [00:02<01:45, 182.62 examples/s] Saving the dataset (79/128 shards): 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 30356/49172 [00:02<01:43, 182.62 examples/s] Saving the dataset (80/128 shards): 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 30740/49172 [00:02<01:40, 182.62 examples/s] Saving the dataset (81/128 shards): 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 31124/49172 [00:02<01:38, 182.62 examples/s] Saving the dataset (82/128 shards): 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31508/49172 [00:02<01:36, 182.62 examples/s] Saving the dataset (83/128 shards): 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31892/49172 [00:02<01:34, 182.62 examples/s] Saving the dataset (84/128 shards): 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 32276/49172 [00:02<01:32, 182.62 examples/s] Saving the dataset (85/128 shards): 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 32660/49172 [00:02<01:30, 182.62 examples/s] Saving the dataset (86/128 shards): 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33044/49172 [00:02<01:28, 182.62 examples/s] Saving the dataset (87/128 shards): 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 33428/49172 [00:02<01:26, 182.62 examples/s] Saving the dataset (88/128 shards): 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 33812/49172 [00:02<01:24, 182.62 examples/s] Saving the dataset (89/128 shards): 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (90/128 shards): 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (91/128 shards): 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 34964/49172 [00:02<01:17, 182.62 examples/s] Saving the dataset (92/128 shards): 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 35348/49172 [00:02<01:15, 182.62 examples/s] Saving the dataset (93/128 shards): 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 35732/49172 [00:02<01:13, 182.62 examples/s] Saving the dataset (94/128 shards): 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 36116/49172 [00:02<01:11, 182.62 examples/s] Saving the dataset (95/128 shards): 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 36500/49172 [00:02<01:09, 182.62 examples/s] Saving the dataset (96/128 shards): 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 36884/49172 [00:02<01:07, 182.62 examples/s] Saving the dataset (97/128 shards): 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 37268/49172 [00:02<01:05, 182.62 examples/s] Saving the dataset (98/128 shards): 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (99/128 shards): 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (100/128 shards): 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 38420/49172 [00:02<00:58, 182.62 examples/s] Saving the dataset (101/128 shards): 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 38804/49172 [00:02<00:56, 182.62 examples/s] Saving the dataset (102/128 shards): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 39188/49172 [00:02<00:54, 182.62 examples/s] Saving the dataset (103/128 shards): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 39572/49172 [00:02<00:52, 182.62 examples/s] Saving the dataset (104/128 shards): 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 39956/49172 [00:02<00:50, 182.62 examples/s] Saving the dataset (105/128 shards): 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 40340/49172 [00:02<00:48, 182.62 examples/s] Saving the dataset (106/128 shards): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 40724/49172 [00:02<00:46, 182.62 examples/s] Saving the dataset (107/128 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 41108/49172 [00:02<00:44, 182.62 examples/s] Saving the dataset (108/128 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 41492/49172 [00:02<00:42, 182.62 examples/s] Saving the dataset (109/128 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 41876/49172 [00:02<00:39, 182.62 examples/s] Saving the dataset (110/128 shards): 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 42260/49172 [00:02<00:37, 182.62 examples/s] Saving the dataset (111/128 shards): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 42644/49172 [00:02<00:35, 182.62 examples/s] Saving the dataset (112/128 shards): 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 43028/49172 [00:02<00:33, 182.62 examples/s] Saving the dataset (113/128 shards): 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 43412/49172 [00:02<00:31, 182.62 examples/s] Saving the dataset (114/128 shards): 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 43796/49172 [00:02<00:29, 182.62 examples/s] Saving the dataset (115/128 shards): 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 44180/49172 [00:02<00:27, 182.62 examples/s] Saving the dataset (116/128 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (117/128 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (118/128 shards): 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 45332/49172 [00:02<00:21, 182.62 examples/s] Saving the dataset (119/128 shards): 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 45716/49172 [00:02<00:18, 182.62 examples/s] Saving the dataset (120/128 shards): 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 46100/49172 [00:02<00:16, 182.62 examples/s] Saving the dataset (121/128 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 46484/49172 [00:02<00:14, 182.62 examples/s] Saving the dataset (122/128 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 46868/49172 [00:02<00:12, 182.62 examples/s] Saving the dataset (123/128 shards): 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 47252/49172 [00:02<00:10, 182.62 examples/s] Saving the dataset (124/128 shards): 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 47636/49172 [00:02<00:08, 182.62 examples/s] Saving the dataset (125/128 shards): 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 48020/49172 [00:02<00:06, 182.62 examples/s] Saving the dataset (126/128 shards): 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 48404/49172 [00:02<00:04, 182.62 examples/s] Saving the dataset (127/128 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 48788/49172 [00:02<00:02, 182.62 examples/s] Saving the dataset (128/128 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 49172/49172 [00:02<00:00, 182.62 examples/s] Saving the dataset (128/128 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 49172/49172 [00:02<00:00, 22251.27 examples/s]
[2025-11-06 16:12:34,233] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:837642] total_num_tokens: 9_208_425
[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:837642] `total_supervised_tokens: 6_847_432`
[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:837642] total_num_steps: 769
[2025-11-06 16:12:34,425] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:837642] Maximum number of steps set at 769
[2025-11-06 16:12:34,441] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:837642] Loading tokenizer... meta-llama/Llama-3.2-3B
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
[2025-11-06 16:12:35,271] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:837642] Loading model
[2025-11-06 16:12:35,502] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:837642] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-11-06 16:12:35,503] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:837642] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-11-06 16:12:35,531] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:837642] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 86.01it/s]
[2025-11-06 16:15:46,317] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:837642] Converting modules to torch.bfloat16
[2025-11-06 16:15:59,472] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:837642] Memory usage after model load 0.000GB ()
[2025-11-06 16:16:00,600] [WARNING] [accelerate.utils.other.check_os_kernel:512] [PID:837642] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-11-06 16:16:07,189] [INFO] [axolotl.train.save_initial_configs:412] [PID:837642] Pre-saving tokenizer to ./outputs/qat_out/...
[2025-11-06 16:16:07,288] [INFO] [axolotl.train.save_initial_configs:417] [PID:837642] Pre-saving model config to ./outputs/qat_out/...
[2025-11-06 16:16:07,290] [INFO] [axolotl.train.execute_training:203] [PID:837642] Starting trainer...
0%| | 0/769 [00:00<?, ?it/s] 0%| | 1/769 [00:10<2:09:18, 10.10s/it] {'loss': 1.1473, 'grad_norm': 4.625, 'learning_rate': 0.0, 'memory/max_active (GiB)': 34.78, 'memory/max_allocated (GiB)': 34.78, 'memory/device_reserved (GiB)': 41.06, 'tokens_per_second_per_gpu': 879.01, 'epoch': 0.0}
0%| | 1/769 [00:10<2:09:18, 10.10s/it][2025-11-06 16:16:17,683] [INFO] [axolotl.core.trainers.base._save:671] [PID:837642] Saving model checkpoint to ./outputs/qat_out/checkpoint-1
0%| | 2/769 [00:28<3:13:22, 15.13s/it] {'loss': 1.1048, 'grad_norm': 4.34375, 'learning_rate': 2.6315789473684213e-07, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 55.05, 'tokens_per_second_per_gpu': 1653.05, 'epoch': 0.0}
0%| | 2/769 [00:28<3:13:22, 15.13s/it] 0%| | 3/769 [00:35<2:23:03, 11.21s/it] {'loss': 1.1442, 'grad_norm': 4.6875, 'learning_rate': 5.263157894736843e-07, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1475.08, 'epoch': 0.0}
0%| | 3/769 [00:35<2:23:03, 11.21s/it] 1%| | 4/769 [00:40<1:55:11, 9.03s/it] {'loss': 1.1473, 'grad_norm': 3.671875, 'learning_rate': 7.894736842105263e-07, 'memory/max_active (GiB)': 46.8, 'memory/max_allocated (GiB)': 46.8, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 2080.31, 'epoch': 0.01}
1%| | 4/769 [00:41<1:55:11, 9.03s/it] 1%| | 5/769 [00:46<1:39:41, 7.83s/it] {'loss': 1.1704, 'grad_norm': 4.15625, 'learning_rate': 1.0526315789473685e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1673.31, 'epoch': 0.01}
1%| | 5/769 [00:46<1:39:41, 7.83s/it] 1%| | 6/769 [00:51<1:28:32, 6.96s/it] {'loss': 1.1557, 'grad_norm': 4.09375, 'learning_rate': 1.3157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1894.62, 'epoch': 0.01}
1%| | 6/769 [00:51<1:28:32, 6.96s/it] 1%| | 7/769 [00:57<1:20:49, 6.36s/it] {'loss': 1.1819, 'grad_norm': 5.21875, 'learning_rate': 1.5789473684210526e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.51, 'tokens_per_second_per_gpu': 1854.54, 'epoch': 0.01}
1%| | 7/769 [00:57<1:20:49, 6.36s/it] 1%| | 8/769 [01:02<1:18:07, 6.16s/it] {'loss': 1.1807, 'grad_norm': 4.125, 'learning_rate': 1.8421052631578948e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1751.39, 'epoch': 0.01}
1%| | 8/769 [01:02<1:18:07, 6.16s/it] 1%| | 9/769 [01:08<1:16:07, 6.01s/it] {'loss': 1.1324, 'grad_norm': 4.40625, 'learning_rate': 2.105263157894737e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1412.84, 'epoch': 0.01}
1%| | 9/769 [01:08<1:16:07, 6.01s/it] 1%|▏ | 10/769 [01:13<1:12:34, 5.74s/it] {'loss': 1.1556, 'grad_norm': 3.78125, 'learning_rate': 2.368421052631579e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1849.28, 'epoch': 0.01}
1%|▏ | 10/769 [01:13<1:12:34, 5.74s/it] 1%|▏ | 11/769 [01:19<1:12:17, 5.72s/it] {'loss': 1.1855, 'grad_norm': 3.859375, 'learning_rate': 2.631578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1624.94, 'epoch': 0.01}
1%|▏ | 11/769 [01:19<1:12:17, 5.72s/it] 2%|▏ | 12/769 [01:24<1:12:03, 5.71s/it] {'loss': 1.1024, 'grad_norm': 4.09375, 'learning_rate': 2.8947368421052634e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1463.62, 'epoch': 0.02}
2%|▏ | 12/769 [01:25<1:12:03, 5.71s/it] 2%|▏ | 13/769 [01:29<1:08:10, 5.41s/it] {'loss': 1.1591, 'grad_norm': 3.765625, 'learning_rate': 3.157894736842105e-06, 'memory/max_active (GiB)': 41.5, 'memory/max_allocated (GiB)': 41.5, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1910.89, 'epoch': 0.02}
2%|▏ | 13/769 [01:29<1:08:10, 5.41s/it] 2%|▏ | 14/769 [01:35<1:09:08, 5.49s/it] {'loss': 1.0844, 'grad_norm': 3.484375, 'learning_rate': 3.421052631578948e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1588.42, 'epoch': 0.02}
2%|▏ | 14/769 [01:35<1:09:08, 5.49s/it] 2%|▏ | 15/769 [01:39<1:05:28, 5.21s/it] {'loss': 1.2444, 'grad_norm': 3.984375, 'learning_rate': 3.6842105263157896e-06, 'memory/max_active (GiB)': 41.46, 'memory/max_allocated (GiB)': 41.46, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1568.51, 'epoch': 0.02}
2%|▏ | 15/769 [01:39<1:05:28, 5.21s/it] 2%|▏ | 16/769 [01:45<1:07:09, 5.35s/it] {'loss': 1.1609, 'grad_norm': 3.578125, 'learning_rate': 3.947368421052632e-06, 'memory/max_active (GiB)': 46.74, 'memory/max_allocated (GiB)': 46.74, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1338.16, 'epoch': 0.02}
2%|▏ | 16/769 [01:45<1:07:09, 5.35s/it] 2%|▏ | 17/769 [01:51<1:08:21, 5.45s/it] {'loss': 1.099, 'grad_norm': 3.125, 'learning_rate': 4.210526315789474e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1704.75, 'epoch': 0.02}
2%|▏ | 17/769 [01:51<1:08:21, 5.45s/it] 2%|▏ | 18/769 [01:57<1:09:09, 5.53s/it] {'loss': 1.0828, 'grad_norm': 3.046875, 'learning_rate': 4.473684210526316e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1664.81, 'epoch': 0.02}
2%|▏ | 18/769 [01:57<1:09:09, 5.53s/it] 2%|▏ | 19/769 [02:02<1:07:39, 5.41s/it] {'loss': 1.2319, 'grad_norm': 3.4375, 'learning_rate': 4.736842105263158e-06, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1602.68, 'epoch': 0.02}
2%|▏ | 19/769 [02:02<1:07:39, 5.41s/it] 3%|β–Ž | 20/769 [02:07<1:08:44, 5.51s/it] {'loss': 1.0638, 'grad_norm': 2.953125, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.13, 'tokens_per_second_per_gpu': 1755.41, 'epoch': 0.03}
3%|β–Ž | 20/769 [02:07<1:08:44, 5.51s/it] 3%|β–Ž | 21/769 [02:13<1:09:22, 5.56s/it] {'loss': 1.2072, 'grad_norm': 2.859375, 'learning_rate': 5.263157894736842e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1833.74, 'epoch': 0.03}
3%|β–Ž | 21/769 [02:13<1:09:22, 5.56s/it] 3%|β–Ž | 22/769 [02:18<1:07:37, 5.43s/it] {'loss': 1.2006, 'grad_norm': 3.109375, 'learning_rate': 5.526315789473685e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1567.66, 'epoch': 0.03}
3%|β–Ž | 22/769 [02:18<1:07:37, 5.43s/it] 3%|β–Ž | 23/769 [02:25<1:12:34, 5.84s/it] {'loss': 1.0081, 'grad_norm': 2.625, 'learning_rate': 5.789473684210527e-06, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1412.25, 'epoch': 0.03}
3%|β–Ž | 23/769 [02:25<1:12:34, 5.84s/it] 3%|β–Ž | 24/769 [02:31<1:12:04, 5.80s/it] {'loss': 1.1437, 'grad_norm': 2.6875, 'learning_rate': 6.0526315789473685e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.26, 'tokens_per_second_per_gpu': 1749.27, 'epoch': 0.03}
3%|β–Ž | 24/769 [02:31<1:12:04, 5.80s/it] 3%|β–Ž | 25/769 [02:36<1:11:33, 5.77s/it] {'loss': 1.0987, 'grad_norm': 2.765625, 'learning_rate': 6.31578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1627.64, 'epoch': 0.03}
3%|β–Ž | 25/769 [02:36<1:11:33, 5.77s/it] 3%|β–Ž | 26/769 [02:42<1:09:06, 5.58s/it] {'loss': 1.1046, 'grad_norm': 2.6875, 'learning_rate': 6.578947368421054e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1967.95, 'epoch': 0.03}
3%|β–Ž | 26/769 [02:42<1:09:06, 5.58s/it] 4%|β–Ž | 27/769 [02:47<1:09:27, 5.62s/it] {'loss': 1.1072, 'grad_norm': 2.8125, 'learning_rate': 6.842105263157896e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1594.62, 'epoch': 0.04}
4%|β–Ž | 27/769 [02:47<1:09:27, 5.62s/it] 4%|β–Ž | 28/769 [02:52<1:05:29, 5.30s/it] {'loss': 1.1974, 'grad_norm': 3.03125, 'learning_rate': 7.1052631578947375e-06, 'memory/max_active (GiB)': 41.47, 'memory/max_allocated (GiB)': 41.47, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1766.49, 'epoch': 0.04}
4%|β–Ž | 28/769 [02:52<1:05:29, 5.30s/it] 4%|▍ | 29/769 [02:58<1:06:52, 5.42s/it] {'loss': 1.2164, 'grad_norm': 2.734375, 'learning_rate': 7.368421052631579e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1593.41, 'epoch': 0.04}
4%|▍ | 29/769 [02:58<1:06:52, 5.42s/it] 4%|▍ | 30/769 [03:03<1:07:46, 5.50s/it] {'loss': 1.1324, 'grad_norm': 2.65625, 'learning_rate': 7.631578947368423e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1607.66, 'epoch': 0.04}
4%|▍ | 30/769 [03:03<1:07:46, 5.50s/it] 4%|▍ | 31/769 [03:09<1:08:21, 5.56s/it] {'loss': 1.0693, 'grad_norm': 2.96875, 'learning_rate': 7.894736842105265e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1454.28, 'epoch': 0.04}
4%|▍ | 31/769 [03:09<1:08:21, 5.56s/it] 4%|▍ | 32/769 [03:14<1:06:43, 5.43s/it] {'loss': 1.0988, 'grad_norm': 2.65625, 'learning_rate': 8.157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1972.66, 'epoch': 0.04}
4%|▍ | 32/769 [03:14<1:06:43, 5.43s/it] 4%|▍ | 33/769 [03:19<1:03:27, 5.17s/it] {'loss': 1.074, 'grad_norm': 2.84375, 'learning_rate': 8.421052631578948e-06, 'memory/max_active (GiB)': 41.48, 'memory/max_allocated (GiB)': 41.48, 'memory/device_reserved (GiB)': 77.13, 'tokens_per_second_per_gpu': 1705.8, 'epoch': 0.04}
4%|▍ | 33/769 [03:19<1:03:27, 5.17s/it] 4%|▍ | 34/769 [03:24<1:03:14, 5.16s/it] {'loss': 1.1673, 'grad_norm': 2.671875, 'learning_rate': 8.68421052631579e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 76.88, 'tokens_per_second_per_gpu': 1761.02, 'epoch': 0.04}
4%|▍ | 34/769 [03:24<1:03:14, 5.16s/it] 5%|▍ | 35/769 [03:29<1:02:59, 5.15s/it] {'loss': 1.1655, 'grad_norm': 3.03125, 'learning_rate': 8.947368421052632e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 76.01, 'tokens_per_second_per_gpu': 1479.42, 'epoch': 0.05}
5%|▍ | 35/769 [03:29<1:02:59, 5.15s/it] 5%|▍ | 36/769 [03:35<1:04:53, 5.31s/it] {'loss': 1.1483, 'grad_norm': 2.71875, 'learning_rate': 9.210526315789474e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1466.59, 'epoch': 0.05}
5%|▍ | 36/769 [03:35<1:04:53, 5.31s/it] 5%|▍ | 37/769 [03:40<1:04:09, 5.26s/it] {'loss': 1.159, 'grad_norm': 2.578125, 'learning_rate': 9.473684210526315e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1806.68, 'epoch': 0.05}
5%|▍ | 37/769 [03:40<1:04:09, 5.26s/it] 5%|▍ | 38/769 [03:45<1:05:41, 5.39s/it] {'loss': 1.1035, 'grad_norm': 2.625, 'learning_rate': 9.736842105263159e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1689.12, 'epoch': 0.05}
5%|▍ | 38/769 [03:45<1:05:41, 5.39s/it] 5%|β–Œ | 39/769 [03:51<1:06:41, 5.48s/it] {'loss': 1.0471, 'grad_norm': 2.609375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1579.8, 'epoch': 0.05}
5%|β–Œ | 39/769 [03:51<1:06:41, 5.48s/it] 5%|β–Œ | 40/769 [03:56<1:05:20, 5.38s/it] {'loss': 1.2091, 'grad_norm': 2.96875, 'learning_rate': 1.0263157894736844e-05, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1542.1, 'epoch': 0.05}
5%|β–Œ | 40/769 [03:56<1:05:20, 5.38s/it] 5%|β–Œ | 41/769 [04:02<1:06:25, 5.47s/it] {'loss': 1.0722, 'grad_norm': 2.59375, 'learning_rate': 1.0526315789473684e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1585.53, 'epoch': 0.05}
5%|β–Œ | 41/769 [04:02<1:06:25, 5.47s/it]