3b-qat-nvfp4 / debug.log

Training in progress, epoch 0

d3fcddb verified 4 months ago

55.7 kB

	[2025-11-06 16:11:41,516] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:837642] baseline 0.000GB ()
	[2025-11-06 16:11:41,516] [INFO] [axolotl.cli.config.load_cfg:248] [PID:837642] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "3b-qat-nvfp4.yaml",
	"base_model": "meta-llama/Llama-3.2-3B",
	"base_model_config": "meta-llama/Llama-3.2-3B",
	"batch_size": 64,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_90",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"cosine_constant_lr_ratio": 0.0,
	"cosine_min_lr_ratio": 1.0,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_prepared_path": "./outputs/dataset_prepared",
	"dataset_processes": 128,
	"datasets": [
	{
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "yahma/alpaca-cleaned",
	"split": "train[:95%]",
	"trust_remote_code": false,
	"type": "alpaca"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.8.0"
	},
	"eval_batch_size": 64,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_table_size": 0,
	"evals_per_epoch": 1,
	"experimental_skip_move_to_device": true,
	"flash_attention": true,
	"fp16": false,
	"gradient_accumulation_steps": 1,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": true
	},
	"hub_model_id": "AlexHung29629/3b-qat-nvfp4",
	"include_tkps": true,
	"is_llama_derived_model": true,
	"learning_rate": 2e-05,
	"liger_fused_linear_cross_entropy": true,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"mean_resizing_embeddings": false,
	"micro_batch_size": 64,
	"model_config_type": "llama",
	"num_epochs": 1.0,
	"optimizer": "adamw_torch_fused",
	"output_dir": "./outputs/qat_out/",
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin"
	],
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qat": {
	"activation_dtype": "TorchAOQuantDType.nvfp4",
	"group_size": 16,
	"quantize_embedding": false,
	"weight_dtype": "TorchAOQuantDType.nvfp4"
	},
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_first_step": true,
	"save_only_model": true,
	"save_safetensors": true,
	"saves_per_epoch": 1,
	"sequence_len": 8192,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"special_tokens": {
	"pad_token": "<\|finetune_right_pad_id\|>"
	},
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "meta-llama/Llama-3.2-3B",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_ray": false,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"warmup_ratio": 0.1,
	"weight_decay": 0.0,
	"world_size": 1
	}
	[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <\|end_of_text\|>
	[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <\|begin_of_text\|>
	[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <\|finetune_right_pad_id\|>
	[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
	[2025-11-06 16:11:46,489] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
	[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:837642] Unable to find prepared dataset in outputs/dataset_prepared/9bc662aed65b76546b2d635b3957a343
	[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:837642] Loading raw datasets...
	[2025-11-06 16:11:46,492] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:837642] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	Generating train split: 0%\| \| 0/51760 [00:00<?, ? examples/s] Generating train split: 100%\|██████████\| 51760/51760 [00:00<00:00, 229845.07 examples/s] Generating train split: 100%\|██████████\| 51760/51760 [00:00<00:00, 228708.96 examples/s]
	[2025-11-06 16:12:08,982] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:837642] Loading dataset: yahma/alpaca-cleaned with base_type: alpaca and prompt_style: None
	Tokenizing Prompts (num_proc=128): 0%\| \| 0/49172 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=128): 0%\| \| 84/49172 [00:01<13:38, 59.94 examples/s] Tokenizing Prompts (num_proc=128): 0%\| \| 176/49172 [00:01<06:01, 135.38 examples/s] Tokenizing Prompts (num_proc=128): 2%\|▏ \| 792/49172 [00:01<01:05, 743.93 examples/s] Tokenizing Prompts (num_proc=128): 3%\|▎ \| 1289/49172 [00:01<00:39, 1224.84 examples/s] Tokenizing Prompts (num_proc=128): 3%\|▎ \| 1663/49172 [00:01<00:30, 1550.01 examples/s] Tokenizing Prompts (num_proc=128): 4%\|▍ \| 2053/49172 [00:02<00:25, 1838.37 examples/s] Tokenizing Prompts (num_proc=128): 5%\|▌ \| 2462/49172 [00:02<00:22, 2120.82 examples/s] Tokenizing Prompts (num_proc=128): 6%\|▌ \| 2931/49172 [00:02<00:17, 2593.78 examples/s] Tokenizing Prompts (num_proc=128): 7%\|▋ \| 3316/49172 [00:02<00:17, 2658.78 examples/s] Tokenizing Prompts (num_proc=128): 8%\|▊ \| 3704/49172 [00:02<00:16, 2738.71 examples/s] Tokenizing Prompts (num_proc=128): 8%\|▊ \| 4072/49172 [00:02<00:16, 2765.32 examples/s] Tokenizing Prompts (num_proc=128): 9%\|▉ \| 4392/49172 [00:02<00:16, 2672.66 examples/s] Tokenizing Prompts (num_proc=128): 10%\|▉ \| 4715/49172 [00:03<00:17, 2538.79 examples/s] Tokenizing Prompts (num_proc=128): 10%\|█ \| 5094/49172 [00:03<00:17, 2586.15 examples/s] Tokenizing Prompts (num_proc=128): 11%\|█ \| 5504/49172 [00:03<00:16, 2692.82 examples/s] Tokenizing Prompts (num_proc=128): 12%\|█▏ \| 5910/49172 [00:03<00:15, 2763.20 examples/s] Tokenizing Prompts (num_proc=128): 13%\|█▎ \| 6311/49172 [00:03<00:14, 2975.07 examples/s] Tokenizing Prompts (num_proc=128): 14%\|█▎ \| 6668/49172 [00:03<00:14, 2859.49 examples/s] Tokenizing Prompts (num_proc=128): 14%\|█▍ \| 7058/49172 [00:03<00:14, 2844.49 examples/s] Tokenizing Prompts (num_proc=128): 15%\|█▌ \| 7385/49172 [00:03<00:15, 2728.62 examples/s] Tokenizing Prompts (num_proc=128): 16%\|█▌ \| 7836/49172 [00:04<00:13, 3015.74 examples/s] Tokenizing Prompts (num_proc=128): 17%\|█▋ \| 8193/49172 [00:04<00:14, 2873.13 examples/s] Tokenizing Prompts (num_proc=128): 17%\|█▋ \| 8537/49172 [00:04<00:14, 2768.24 examples/s] Tokenizing Prompts (num_proc=128): 18%\|█▊ \| 8959/49172 [00:04<00:13, 2885.47 examples/s] Tokenizing Prompts (num_proc=128): 19%\|█▉ \| 9339/49172 [00:04<00:14, 2805.07 examples/s] Tokenizing Prompts (num_proc=128): 20%\|█▉ \| 9755/49172 [00:04<00:13, 2846.53 examples/s] Tokenizing Prompts (num_proc=128): 21%\|██ \| 10145/49172 [00:04<00:13, 2839.00 examples/s] Tokenizing Prompts (num_proc=128): 22%\|██▏ \| 10628/49172 [00:05<00:12, 3040.97 examples/s] Tokenizing Prompts (num_proc=128): 22%\|██▏ \| 11039/49172 [00:05<00:11, 3253.40 examples/s] Tokenizing Prompts (num_proc=128): 23%\|██▎ \| 11385/49172 [00:05<00:12, 2982.44 examples/s] Tokenizing Prompts (num_proc=128): 24%\|██▍ \| 11692/49172 [00:05<00:13, 2756.63 examples/s] Tokenizing Prompts (num_proc=128): 25%\|██▍ \| 12063/49172 [00:05<00:13, 2720.60 examples/s] Tokenizing Prompts (num_proc=128): 25%\|██▌ \| 12463/49172 [00:05<00:13, 2810.95 examples/s] Tokenizing Prompts (num_proc=128): 26%\|██▌ \| 12850/49172 [00:05<00:12, 2819.15 examples/s] Tokenizing Prompts (num_proc=128): 27%\|██▋ \| 13292/49172 [00:06<00:12, 2922.32 examples/s] Tokenizing Prompts (num_proc=128): 28%\|██▊ \| 13600/49172 [00:06<00:13, 2700.65 examples/s] Tokenizing Prompts (num_proc=128): 28%\|██▊ \| 13989/49172 [00:06<00:12, 2716.86 examples/s] Tokenizing Prompts (num_proc=128): 29%\|██▉ \| 14475/49172 [00:06<00:12, 2852.98 examples/s] Tokenizing Prompts (num_proc=128): 30%\|███ \| 14914/49172 [00:06<00:11, 2938.70 examples/s] Tokenizing Prompts (num_proc=128): 31%\|███▏ \| 15412/49172 [00:06<00:10, 3223.11 examples/s] Tokenizing Prompts (num_proc=128): 32%\|███▏ \| 15748/49172 [00:06<00:11, 3025.74 examples/s] Tokenizing Prompts (num_proc=128): 33%\|███▎ \| 16102/49172 [00:06<00:11, 2858.09 examples/s] Tokenizing Prompts (num_proc=128): 33%\|███▎ \| 16438/49172 [00:07<00:12, 2720.40 examples/s] Tokenizing Prompts (num_proc=128): 34%\|███▍ \| 16767/49172 [00:07<00:12, 2595.58 examples/s] Tokenizing Prompts (num_proc=128): 35%\|███▍ \| 17149/49172 [00:07<00:12, 2665.96 examples/s] Tokenizing Prompts (num_proc=128): 36%\|███▌ \| 17516/49172 [00:07<00:11, 2643.92 examples/s] Tokenizing Prompts (num_proc=128): 37%\|███▋ \| 17961/49172 [00:07<00:10, 2849.24 examples/s] Tokenizing Prompts (num_proc=128): 37%\|███▋ \| 18279/49172 [00:07<00:10, 2822.57 examples/s] Tokenizing Prompts (num_proc=128): 38%\|███▊ \| 18574/49172 [00:07<00:11, 2591.21 examples/s] Tokenizing Prompts (num_proc=128): 39%\|███▊ \| 18957/49172 [00:08<00:11, 2625.56 examples/s] Tokenizing Prompts (num_proc=128): 40%\|███▉ \| 19438/49172 [00:08<00:10, 2894.61 examples/s] Tokenizing Prompts (num_proc=128): 40%\|████ \| 19776/49172 [00:08<00:10, 2741.27 examples/s] Tokenizing Prompts (num_proc=128): 41%\|████ \| 20133/49172 [00:08<00:10, 2676.71 examples/s] Tokenizing Prompts (num_proc=128): 42%\|████▏ \| 20582/49172 [00:08<00:10, 2813.67 examples/s] Tokenizing Prompts (num_proc=128): 43%\|████▎ \| 21056/49172 [00:08<00:09, 2991.88 examples/s] Tokenizing Prompts (num_proc=128): 44%\|████▎ \| 21507/49172 [00:08<00:08, 3120.60 examples/s] Tokenizing Prompts (num_proc=128): 44%\|████▍ \| 21844/49172 [00:09<00:09, 2927.37 examples/s] Tokenizing Prompts (num_proc=128): 45%\|████▌ \| 22214/49172 [00:09<00:09, 2980.95 examples/s] Tokenizing Prompts (num_proc=128): 46%\|████▌ \| 22524/49172 [00:09<00:09, 2740.32 examples/s] Tokenizing Prompts (num_proc=128): 46%\|████▋ \| 22828/49172 [00:09<00:09, 2639.81 examples/s] Tokenizing Prompts (num_proc=128): 47%\|████▋ \| 23146/49172 [00:09<00:10, 2510.35 examples/s] Tokenizing Prompts (num_proc=128): 48%\|████▊ \| 23619/49172 [00:09<00:09, 2763.04 examples/s] Tokenizing Prompts (num_proc=128): 49%\|████▉ \| 24069/49172 [00:09<00:08, 2964.79 examples/s] Tokenizing Prompts (num_proc=128): 50%\|████▉ \| 24455/49172 [00:09<00:08, 2924.79 examples/s] Tokenizing Prompts (num_proc=128): 50%\|█████ \| 24809/49172 [00:10<00:08, 2856.24 examples/s] Tokenizing Prompts (num_proc=128): 51%\|█████ \| 25112/49172 [00:10<00:09, 2621.31 examples/s] Tokenizing Prompts (num_proc=128): 52%\|█████▏ \| 25522/49172 [00:10<00:08, 2714.48 examples/s] Tokenizing Prompts (num_proc=128): 53%\|█████▎ \| 25873/49172 [00:10<00:08, 2705.01 examples/s] Tokenizing Prompts (num_proc=128): 54%\|█████▎ \| 26322/49172 [00:10<00:07, 2907.63 examples/s] Tokenizing Prompts (num_proc=128): 54%\|█████▍ \| 26671/49172 [00:10<00:08, 2787.83 examples/s] Tokenizing Prompts (num_proc=128): 55%\|█████▌ \| 27127/49172 [00:10<00:07, 2923.69 examples/s] Tokenizing Prompts (num_proc=128): 56%\|█████▌ \| 27506/49172 [00:11<00:07, 2929.98 examples/s] Tokenizing Prompts (num_proc=128): 57%\|█████▋ \| 27928/49172 [00:11<00:07, 2972.71 examples/s] Tokenizing Prompts (num_proc=128): 58%\|█████▊ \| 28285/49172 [00:11<00:06, 3017.41 examples/s] Tokenizing Prompts (num_proc=128): 58%\|█████▊ \| 28589/49172 [00:11<00:07, 2769.13 examples/s] Tokenizing Prompts (num_proc=128): 59%\|█████▉ \| 28960/49172 [00:11<00:07, 2746.25 examples/s] Tokenizing Prompts (num_proc=128): 60%\|█████▉ \| 29343/49172 [00:11<00:07, 2735.06 examples/s] Tokenizing Prompts (num_proc=128): 60%\|██████ \| 29730/49172 [00:11<00:07, 2777.41 examples/s] Tokenizing Prompts (num_proc=128): 61%\|██████▏ \| 30201/49172 [00:11<00:06, 3027.86 examples/s] Tokenizing Prompts (num_proc=128): 62%\|██████▏ \| 30511/49172 [00:12<00:06, 2794.02 examples/s] Tokenizing Prompts (num_proc=128): 63%\|██████▎ \| 30963/49172 [00:12<00:06, 2860.65 examples/s] Tokenizing Prompts (num_proc=128): 64%\|██████▎ \| 31260/49172 [00:12<00:06, 2659.00 examples/s] Tokenizing Prompts (num_proc=128): 64%\|██████▍ \| 31661/49172 [00:12<00:06, 2744.06 examples/s] Tokenizing Prompts (num_proc=128): 65%\|██████▌ \| 32103/49172 [00:12<00:05, 2886.04 examples/s] Tokenizing Prompts (num_proc=128): 66%\|██████▌ \| 32523/49172 [00:12<00:05, 2982.93 examples/s] Tokenizing Prompts (num_proc=128): 67%\|██████▋ \| 32894/49172 [00:12<00:05, 2895.08 examples/s] Tokenizing Prompts (num_proc=128): 68%\|██████▊ \| 33277/49172 [00:13<00:05, 3037.84 examples/s] Tokenizing Prompts (num_proc=128): 68%\|██████▊ \| 33649/49172 [00:13<00:05, 2842.83 examples/s] Tokenizing Prompts (num_proc=128): 69%\|██████▉ \| 33971/49172 [00:13<00:05, 2690.27 examples/s] Tokenizing Prompts (num_proc=128): 70%\|██████▉ \| 34337/49172 [00:13<00:05, 2695.46 examples/s] Tokenizing Prompts (num_proc=128): 71%\|███████ \| 34730/49172 [00:13<00:05, 2719.08 examples/s] Tokenizing Prompts (num_proc=128): 72%\|███████▏ \| 35193/49172 [00:13<00:04, 2910.87 examples/s] Tokenizing Prompts (num_proc=128): 72%\|███████▏ \| 35583/49172 [00:13<00:04, 2876.12 examples/s] Tokenizing Prompts (num_proc=128): 73%\|███████▎ \| 35965/49172 [00:14<00:04, 2847.46 examples/s] Tokenizing Prompts (num_proc=128): 74%\|███████▍ \| 36341/49172 [00:14<00:04, 2829.40 examples/s] Tokenizing Prompts (num_proc=128): 75%\|███████▍ \| 36647/49172 [00:14<00:04, 2673.80 examples/s] Tokenizing Prompts (num_proc=128): 75%\|███████▌ \| 37103/49172 [00:14<00:04, 2839.03 examples/s] Tokenizing Prompts (num_proc=128): 76%\|███████▋ \| 37568/49172 [00:14<00:03, 3009.77 examples/s] Tokenizing Prompts (num_proc=128): 77%\|███████▋ \| 37894/49172 [00:14<00:03, 2833.19 examples/s] Tokenizing Prompts (num_proc=128): 78%\|███████▊ \| 38273/49172 [00:14<00:03, 2797.05 examples/s] Tokenizing Prompts (num_proc=128): 79%\|███████▊ \| 38625/49172 [00:15<00:03, 2713.00 examples/s] Tokenizing Prompts (num_proc=128): 80%\|███████▉ \| 39134/49172 [00:15<00:03, 2989.24 examples/s] Tokenizing Prompts (num_proc=128): 80%\|████████ \| 39484/49172 [00:15<00:03, 2970.93 examples/s] Tokenizing Prompts (num_proc=128): 81%\|████████ \| 39830/49172 [00:15<00:03, 2822.53 examples/s] Tokenizing Prompts (num_proc=128): 82%\|████████▏ \| 40145/49172 [00:15<00:03, 2691.61 examples/s] Tokenizing Prompts (num_proc=128): 83%\|████████▎ \| 40611/49172 [00:15<00:02, 2919.81 examples/s] Tokenizing Prompts (num_proc=128): 83%\|████████▎ \| 40933/49172 [00:15<00:02, 2872.22 examples/s] Tokenizing Prompts (num_proc=128): 84%\|████████▍ \| 41235/49172 [00:15<00:03, 2550.35 examples/s] Tokenizing Prompts (num_proc=128): 85%\|████████▍ \| 41636/49172 [00:16<00:02, 2623.37 examples/s] Tokenizing Prompts (num_proc=128): 85%\|████████▌ \| 42011/49172 [00:16<00:02, 2667.31 examples/s] Tokenizing Prompts (num_proc=128): 87%\|████████▋ \| 42565/49172 [00:16<00:02, 3151.00 examples/s] Tokenizing Prompts (num_proc=128): 87%\|████████▋ \| 42957/49172 [00:16<00:02, 3026.85 examples/s] Tokenizing Prompts (num_proc=128): 88%\|████████▊ \| 43325/49172 [00:16<00:02, 2828.92 examples/s] Tokenizing Prompts (num_proc=128): 89%\|████████▉ \| 43700/49172 [00:16<00:01, 2829.86 examples/s] Tokenizing Prompts (num_proc=128): 90%\|████████▉ \| 44022/49172 [00:16<00:01, 2649.38 examples/s] Tokenizing Prompts (num_proc=128): 90%\|█████████ \| 44422/49172 [00:17<00:01, 2713.96 examples/s] Tokenizing Prompts (num_proc=128): 91%\|█████████ \| 44782/49172 [00:17<00:01, 2703.83 examples/s] Tokenizing Prompts (num_proc=128): 92%\|█████████▏\| 45155/49172 [00:17<00:01, 2707.69 examples/s] Tokenizing Prompts (num_proc=128): 93%\|█████████▎\| 45628/49172 [00:17<00:01, 2900.02 examples/s] Tokenizing Prompts (num_proc=128): 94%\|█████████▎\| 45986/49172 [00:17<00:01, 2826.26 examples/s] Tokenizing Prompts (num_proc=128): 94%\|█████████▍\| 46333/49172 [00:17<00:01, 2738.12 examples/s] Tokenizing Prompts (num_proc=128): 95%\|█████████▌\| 46787/49172 [00:17<00:00, 3126.02 examples/s] Tokenizing Prompts (num_proc=128): 96%\|█████████▌\| 47266/49172 [00:17<00:00, 3503.55 examples/s] Tokenizing Prompts (num_proc=128): 97%\|█████████▋\| 47643/49172 [00:18<00:00, 3338.62 examples/s] Tokenizing Prompts (num_proc=128): 98%\|█████████▊\| 48031/49172 [00:18<00:00, 3013.54 examples/s] Tokenizing Prompts (num_proc=128): 99%\|█████████▊\| 48485/49172 [00:18<00:00, 3381.58 examples/s] Tokenizing Prompts (num_proc=128): 99%\|█████████▉\| 48913/49172 [00:18<00:00, 3410.78 examples/s] Tokenizing Prompts (num_proc=128): 100%\|██████████\| 49172/49172 [00:19<00:00, 2564.69 examples/s]
	[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:837642] min_input_len: 33
	[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:837642] max_input_len: 1051
	Dropping Long Sequences (>8192) (num_proc=128): 0%\| \| 0/49172 [00:00<?, ? examples/s] Dropping Long Sequences (>8192) (num_proc=128): 1%\| \| 385/49172 [00:00<01:43, 471.65 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 5%\|▌ \| 2695/49172 [00:00<00:12, 3750.47 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 13%\|█▎ \| 6545/49172 [00:01<00:04, 9405.16 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 19%\|█▉ \| 9236/49172 [00:01<00:03, 12478.76 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 25%\|██▌ \| 12308/49172 [00:01<00:02, 15931.93 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 30%\|███ \| 14996/49172 [00:01<00:01, 17930.73 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 37%\|███▋ \| 18068/49172 [00:01<00:01, 20677.88 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 44%\|████▍ \| 21524/49172 [00:01<00:01, 23483.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 50%\|█████ \| 24596/49172 [00:01<00:01, 23335.07 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 60%\|██████ \| 29588/49172 [00:01<00:00, 29166.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 69%\|██████▉ \| 33812/49172 [00:01<00:00, 31889.27 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 76%\|███████▌ \| 37268/49172 [00:02<00:00, 32125.51 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 83%\|████████▎ \| 40724/49172 [00:02<00:00, 31182.70 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 93%\|█████████▎\| 45716/49172 [00:02<00:00, 33653.89 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 100%\|██████████\| 49172/49172 [00:02<00:00, 18442.91 examples/s]
	Saving the dataset (0/128 shards): 0%\| \| 0/49172 [00:00<?, ? examples/s] Saving the dataset (0/128 shards): 1%\| \| 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (1/128 shards): 1%\| \| 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (2/128 shards): 2%\|▏ \| 770/49172 [00:02<04:25, 182.62 examples/s] Saving the dataset (3/128 shards): 2%\|▏ \| 1155/49172 [00:02<04:22, 182.62 examples/s] Saving the dataset (4/128 shards): 3%\|▎ \| 1540/49172 [00:02<04:20, 182.62 examples/s] Saving the dataset (5/128 shards): 4%\|▍ \| 1925/49172 [00:02<04:18, 182.62 examples/s] Saving the dataset (6/128 shards): 5%\|▍ \| 2310/49172 [00:02<04:16, 182.62 examples/s] Saving the dataset (7/128 shards): 5%\|▌ \| 2695/49172 [00:02<04:14, 182.62 examples/s] Saving the dataset (8/128 shards): 6%\|▋ \| 3080/49172 [00:02<04:12, 182.62 examples/s] Saving the dataset (9/128 shards): 7%\|▋ \| 3465/49172 [00:02<04:10, 182.62 examples/s] Saving the dataset (10/128 shards): 8%\|▊ \| 3850/49172 [00:02<04:08, 182.62 examples/s] Saving the dataset (11/128 shards): 9%\|▊ \| 4235/49172 [00:02<04:06, 182.62 examples/s] Saving the dataset (12/128 shards): 9%\|▉ \| 4620/49172 [00:02<04:03, 182.62 examples/s] Saving the dataset (13/128 shards): 10%\|█ \| 5005/49172 [00:02<04:01, 182.62 examples/s] Saving the dataset (14/128 shards): 11%\|█ \| 5390/49172 [00:02<03:59, 182.62 examples/s] Saving the dataset (15/128 shards): 12%\|█▏ \| 5775/49172 [00:02<03:57, 182.62 examples/s] Saving the dataset (16/128 shards): 13%\|█▎ \| 6160/49172 [00:02<03:55, 182.62 examples/s] Saving the dataset (17/128 shards): 13%\|█▎ \| 6545/49172 [00:02<03:53, 182.62 examples/s] Saving the dataset (18/128 shards): 14%\|█▍ \| 6930/49172 [00:02<03:51, 182.62 examples/s] Saving the dataset (19/128 shards): 15%\|█▍ \| 7315/49172 [00:02<03:49, 182.62 examples/s] Saving the dataset (20/128 shards): 16%\|█▌ \| 7700/49172 [00:02<03:47, 182.62 examples/s] Saving the dataset (21/128 shards): 16%\|█▋ \| 8084/49172 [00:02<03:44, 182.62 examples/s] Saving the dataset (22/128 shards): 17%\|█▋ \| 8468/49172 [00:02<03:42, 182.62 examples/s] Saving the dataset (23/128 shards): 18%\|█▊ \| 8852/49172 [00:02<03:40, 182.62 examples/s] Saving the dataset (24/128 shards): 19%\|█▉ \| 9236/49172 [00:02<03:38, 182.62 examples/s] Saving the dataset (25/128 shards): 20%\|█▉ \| 9620/49172 [00:02<03:36, 182.62 examples/s] Saving the dataset (26/128 shards): 20%\|██ \| 10004/49172 [00:02<03:34, 182.62 examples/s] Saving the dataset (27/128 shards): 21%\|██ \| 10388/49172 [00:02<03:32, 182.62 examples/s] Saving the dataset (28/128 shards): 22%\|██▏ \| 10772/49172 [00:02<03:30, 182.62 examples/s] Saving the dataset (29/128 shards): 23%\|██▎ \| 11156/49172 [00:02<03:28, 182.62 examples/s] Saving the dataset (30/128 shards): 23%\|██▎ \| 11540/49172 [00:02<03:26, 182.62 examples/s] Saving the dataset (31/128 shards): 24%\|██▍ \| 11924/49172 [00:02<03:23, 182.62 examples/s] Saving the dataset (32/128 shards): 25%\|██▌ \| 12308/49172 [00:02<03:21, 182.62 examples/s] Saving the dataset (33/128 shards): 26%\|██▌ \| 12692/49172 [00:02<03:19, 182.62 examples/s] Saving the dataset (34/128 shards): 27%\|██▋ \| 13076/49172 [00:02<03:17, 182.62 examples/s] Saving the dataset (35/128 shards): 27%\|██▋ \| 13460/49172 [00:02<03:15, 182.62 examples/s] Saving the dataset (36/128 shards): 28%\|██▊ \| 13844/49172 [00:02<03:13, 182.62 examples/s] Saving the dataset (37/128 shards): 29%\|██▉ \| 14228/49172 [00:02<03:11, 182.62 examples/s] Saving the dataset (38/128 shards): 30%\|██▉ \| 14612/49172 [00:02<03:09, 182.62 examples/s] Saving the dataset (39/128 shards): 30%\|███ \| 14996/49172 [00:02<03:07, 182.62 examples/s] Saving the dataset (40/128 shards): 31%\|███▏ \| 15380/49172 [00:02<03:05, 182.62 examples/s] Saving the dataset (41/128 shards): 32%\|███▏ \| 15764/49172 [00:02<03:02, 182.62 examples/s] Saving the dataset (42/128 shards): 33%\|███▎ \| 16148/49172 [00:02<03:00, 182.62 examples/s] Saving the dataset (43/128 shards): 34%\|███▎ \| 16532/49172 [00:02<02:58, 182.62 examples/s] Saving the dataset (44/128 shards): 34%\|███▍ \| 16916/49172 [00:02<02:56, 182.62 examples/s] Saving the dataset (45/128 shards): 35%\|███▌ \| 17300/49172 [00:02<02:54, 182.62 examples/s] Saving the dataset (46/128 shards): 36%\|███▌ \| 17684/49172 [00:02<02:52, 182.62 examples/s] Saving the dataset (47/128 shards): 37%\|███▋ \| 18068/49172 [00:02<02:50, 182.62 examples/s] Saving the dataset (48/128 shards): 38%\|███▊ \| 18452/49172 [00:02<02:48, 182.62 examples/s] Saving the dataset (49/128 shards): 38%\|███▊ \| 18836/49172 [00:02<02:46, 182.62 examples/s] Saving the dataset (50/128 shards): 39%\|███▉ \| 19220/49172 [00:02<02:44, 182.62 examples/s] Saving the dataset (51/128 shards): 40%\|███▉ \| 19604/49172 [00:02<02:41, 182.62 examples/s] Saving the dataset (52/128 shards): 41%\|████ \| 19988/49172 [00:02<02:39, 182.62 examples/s] Saving the dataset (53/128 shards): 41%\|████▏ \| 20372/49172 [00:02<02:37, 182.62 examples/s] Saving the dataset (54/128 shards): 42%\|████▏ \| 20756/49172 [00:02<02:35, 182.62 examples/s] Saving the dataset (55/128 shards): 43%\|████▎ \| 21140/49172 [00:02<02:33, 182.62 examples/s] Saving the dataset (56/128 shards): 44%\|████▍ \| 21524/49172 [00:02<02:31, 182.62 examples/s] Saving the dataset (57/128 shards): 45%\|████▍ \| 21908/49172 [00:02<02:29, 182.62 examples/s] Saving the dataset (58/128 shards): 45%\|████▌ \| 22292/49172 [00:02<02:27, 182.62 examples/s] Saving the dataset (59/128 shards): 46%\|████▌ \| 22676/49172 [00:02<02:25, 182.62 examples/s] Saving the dataset (60/128 shards): 47%\|████▋ \| 23060/49172 [00:02<02:22, 182.62 examples/s] Saving the dataset (61/128 shards): 48%\|████▊ \| 23444/49172 [00:02<02:20, 182.62 examples/s] Saving the dataset (62/128 shards): 48%\|████▊ \| 23828/49172 [00:02<02:18, 182.62 examples/s] Saving the dataset (63/128 shards): 49%\|████▉ \| 24212/49172 [00:02<02:16, 182.62 examples/s] Saving the dataset (64/128 shards): 50%\|█████ \| 24596/49172 [00:02<02:14, 182.62 examples/s] Saving the dataset (65/128 shards): 51%\|█████ \| 24980/49172 [00:02<02:12, 182.62 examples/s] Saving the dataset (66/128 shards): 52%\|█████▏ \| 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (67/128 shards): 52%\|█████▏ \| 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (68/128 shards): 53%\|█████▎ \| 26132/49172 [00:02<02:06, 182.62 examples/s] Saving the dataset (69/128 shards): 54%\|█████▍ \| 26516/49172 [00:02<02:04, 182.62 examples/s] Saving the dataset (70/128 shards): 55%\|█████▍ \| 26900/49172 [00:02<02:01, 182.62 examples/s] Saving the dataset (71/128 shards): 55%\|█████▌ \| 27284/49172 [00:02<01:59, 182.62 examples/s] Saving the dataset (72/128 shards): 56%\|█████▋ \| 27668/49172 [00:02<01:57, 182.62 examples/s] Saving the dataset (73/128 shards): 57%\|█████▋ \| 28052/49172 [00:02<01:55, 182.62 examples/s] Saving the dataset (74/128 shards): 58%\|█████▊ \| 28436/49172 [00:02<01:53, 182.62 examples/s] Saving the dataset (75/128 shards): 59%\|█████▊ \| 28820/49172 [00:02<01:51, 182.62 examples/s] Saving the dataset (76/128 shards): 59%\|█████▉ \| 29204/49172 [00:02<01:49, 182.62 examples/s] Saving the dataset (77/128 shards): 60%\|██████ \| 29588/49172 [00:02<01:47, 182.62 examples/s] Saving the dataset (78/128 shards): 61%\|██████ \| 29972/49172 [00:02<01:45, 182.62 examples/s] Saving the dataset (79/128 shards): 62%\|██████▏ \| 30356/49172 [00:02<01:43, 182.62 examples/s] Saving the dataset (80/128 shards): 63%\|██████▎ \| 30740/49172 [00:02<01:40, 182.62 examples/s] Saving the dataset (81/128 shards): 63%\|██████▎ \| 31124/49172 [00:02<01:38, 182.62 examples/s] Saving the dataset (82/128 shards): 64%\|██████▍ \| 31508/49172 [00:02<01:36, 182.62 examples/s] Saving the dataset (83/128 shards): 65%\|██████▍ \| 31892/49172 [00:02<01:34, 182.62 examples/s] Saving the dataset (84/128 shards): 66%\|██████▌ \| 32276/49172 [00:02<01:32, 182.62 examples/s] Saving the dataset (85/128 shards): 66%\|██████▋ \| 32660/49172 [00:02<01:30, 182.62 examples/s] Saving the dataset (86/128 shards): 67%\|██████▋ \| 33044/49172 [00:02<01:28, 182.62 examples/s] Saving the dataset (87/128 shards): 68%\|██████▊ \| 33428/49172 [00:02<01:26, 182.62 examples/s] Saving the dataset (88/128 shards): 69%\|██████▉ \| 33812/49172 [00:02<01:24, 182.62 examples/s] Saving the dataset (89/128 shards): 70%\|███████ \| 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (90/128 shards): 70%\|███████ \| 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (91/128 shards): 71%\|███████ \| 34964/49172 [00:02<01:17, 182.62 examples/s] Saving the dataset (92/128 shards): 72%\|███████▏ \| 35348/49172 [00:02<01:15, 182.62 examples/s] Saving the dataset (93/128 shards): 73%\|███████▎ \| 35732/49172 [00:02<01:13, 182.62 examples/s] Saving the dataset (94/128 shards): 73%\|███████▎ \| 36116/49172 [00:02<01:11, 182.62 examples/s] Saving the dataset (95/128 shards): 74%\|███████▍ \| 36500/49172 [00:02<01:09, 182.62 examples/s] Saving the dataset (96/128 shards): 75%\|███████▌ \| 36884/49172 [00:02<01:07, 182.62 examples/s] Saving the dataset (97/128 shards): 76%\|███████▌ \| 37268/49172 [00:02<01:05, 182.62 examples/s] Saving the dataset (98/128 shards): 77%\|███████▋ \| 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (99/128 shards): 77%\|███████▋ \| 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (100/128 shards): 78%\|███████▊ \| 38420/49172 [00:02<00:58, 182.62 examples/s] Saving the dataset (101/128 shards): 79%\|███████▉ \| 38804/49172 [00:02<00:56, 182.62 examples/s] Saving the dataset (102/128 shards): 80%\|███████▉ \| 39188/49172 [00:02<00:54, 182.62 examples/s] Saving the dataset (103/128 shards): 80%\|████████ \| 39572/49172 [00:02<00:52, 182.62 examples/s] Saving the dataset (104/128 shards): 81%\|████████▏ \| 39956/49172 [00:02<00:50, 182.62 examples/s] Saving the dataset (105/128 shards): 82%\|████████▏ \| 40340/49172 [00:02<00:48, 182.62 examples/s] Saving the dataset (106/128 shards): 83%\|████████▎ \| 40724/49172 [00:02<00:46, 182.62 examples/s] Saving the dataset (107/128 shards): 84%\|████████▎ \| 41108/49172 [00:02<00:44, 182.62 examples/s] Saving the dataset (108/128 shards): 84%\|████████▍ \| 41492/49172 [00:02<00:42, 182.62 examples/s] Saving the dataset (109/128 shards): 85%\|████████▌ \| 41876/49172 [00:02<00:39, 182.62 examples/s] Saving the dataset (110/128 shards): 86%\|████████▌ \| 42260/49172 [00:02<00:37, 182.62 examples/s] Saving the dataset (111/128 shards): 87%\|████████▋ \| 42644/49172 [00:02<00:35, 182.62 examples/s] Saving the dataset (112/128 shards): 88%\|████████▊ \| 43028/49172 [00:02<00:33, 182.62 examples/s] Saving the dataset (113/128 shards): 88%\|████████▊ \| 43412/49172 [00:02<00:31, 182.62 examples/s] Saving the dataset (114/128 shards): 89%\|████████▉ \| 43796/49172 [00:02<00:29, 182.62 examples/s] Saving the dataset (115/128 shards): 90%\|████████▉ \| 44180/49172 [00:02<00:27, 182.62 examples/s] Saving the dataset (116/128 shards): 91%\|█████████▏\| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (117/128 shards): 91%\|█████████▏\| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (118/128 shards): 92%\|█████████▏\| 45332/49172 [00:02<00:21, 182.62 examples/s] Saving the dataset (119/128 shards): 93%\|█████████▎\| 45716/49172 [00:02<00:18, 182.62 examples/s] Saving the dataset (120/128 shards): 94%\|█████████▍\| 46100/49172 [00:02<00:16, 182.62 examples/s] Saving the dataset (121/128 shards): 95%\|█████████▍\| 46484/49172 [00:02<00:14, 182.62 examples/s] Saving the dataset (122/128 shards): 95%\|█████████▌\| 46868/49172 [00:02<00:12, 182.62 examples/s] Saving the dataset (123/128 shards): 96%\|█████████▌\| 47252/49172 [00:02<00:10, 182.62 examples/s] Saving the dataset (124/128 shards): 97%\|█████████▋\| 47636/49172 [00:02<00:08, 182.62 examples/s] Saving the dataset (125/128 shards): 98%\|█████████▊\| 48020/49172 [00:02<00:06, 182.62 examples/s] Saving the dataset (126/128 shards): 98%\|█████████▊\| 48404/49172 [00:02<00:04, 182.62 examples/s] Saving the dataset (127/128 shards): 99%\|█████████▉\| 48788/49172 [00:02<00:02, 182.62 examples/s] Saving the dataset (128/128 shards): 100%\|██████████\| 49172/49172 [00:02<00:00, 182.62 examples/s] Saving the dataset (128/128 shards): 100%\|██████████\| 49172/49172 [00:02<00:00, 22251.27 examples/s]
	[2025-11-06 16:12:34,233] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:837642] total_num_tokens: 9_208_425
	[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:837642] `total_supervised_tokens: 6_847_432`
	[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:837642] total_num_steps: 769
	[2025-11-06 16:12:34,425] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:837642] Maximum number of steps set at 769
	[2025-11-06 16:12:34,441] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:837642] Loading tokenizer... meta-llama/Llama-3.2-3B
	[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <\|end_of_text\|>
	[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <\|begin_of_text\|>
	[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <\|finetune_right_pad_id\|>
	[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
	[2025-11-06 16:12:35,271] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
	[2025-11-06 16:12:35,271] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:837642] Loading model
	[2025-11-06 16:12:35,502] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:837642] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2025-11-06 16:12:35,503] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:837642] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2025-11-06 16:12:35,531] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:837642] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
	Loading checkpoint shards: 0%\| \| 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 100%\|██████████\| 2/2 [00:00<00:00, 86.01it/s]
	[2025-11-06 16:15:46,317] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:837642] Converting modules to torch.bfloat16
	[2025-11-06 16:15:59,472] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:837642] Memory usage after model load 0.000GB ()
	[2025-11-06 16:16:00,600] [WARNING] [accelerate.utils.other.check_os_kernel:512] [PID:837642] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
	[2025-11-06 16:16:07,189] [INFO] [axolotl.train.save_initial_configs:412] [PID:837642] Pre-saving tokenizer to ./outputs/qat_out/...
	[2025-11-06 16:16:07,288] [INFO] [axolotl.train.save_initial_configs:417] [PID:837642] Pre-saving model config to ./outputs/qat_out/...
	[2025-11-06 16:16:07,290] [INFO] [axolotl.train.execute_training:203] [PID:837642] Starting trainer...
	0%\| \| 0/769 [00:00<?, ?it/s] 0%\| \| 1/769 [00:10<2:09:18, 10.10s/it] {'loss': 1.1473, 'grad_norm': 4.625, 'learning_rate': 0.0, 'memory/max_active (GiB)': 34.78, 'memory/max_allocated (GiB)': 34.78, 'memory/device_reserved (GiB)': 41.06, 'tokens_per_second_per_gpu': 879.01, 'epoch': 0.0}
	0%\| \| 1/769 [00:10<2:09:18, 10.10s/it][2025-11-06 16:16:17,683] [INFO] [axolotl.core.trainers.base._save:671] [PID:837642] Saving model checkpoint to ./outputs/qat_out/checkpoint-1
	0%\| \| 2/769 [00:28<3:13:22, 15.13s/it] {'loss': 1.1048, 'grad_norm': 4.34375, 'learning_rate': 2.6315789473684213e-07, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 55.05, 'tokens_per_second_per_gpu': 1653.05, 'epoch': 0.0}
	0%\| \| 2/769 [00:28<3:13:22, 15.13s/it] 0%\| \| 3/769 [00:35<2:23:03, 11.21s/it] {'loss': 1.1442, 'grad_norm': 4.6875, 'learning_rate': 5.263157894736843e-07, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1475.08, 'epoch': 0.0}
	0%\| \| 3/769 [00:35<2:23:03, 11.21s/it] 1%\| \| 4/769 [00:40<1:55:11, 9.03s/it] {'loss': 1.1473, 'grad_norm': 3.671875, 'learning_rate': 7.894736842105263e-07, 'memory/max_active (GiB)': 46.8, 'memory/max_allocated (GiB)': 46.8, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 2080.31, 'epoch': 0.01}
	1%\| \| 4/769 [00:41<1:55:11, 9.03s/it] 1%\| \| 5/769 [00:46<1:39:41, 7.83s/it] {'loss': 1.1704, 'grad_norm': 4.15625, 'learning_rate': 1.0526315789473685e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1673.31, 'epoch': 0.01}
	1%\| \| 5/769 [00:46<1:39:41, 7.83s/it] 1%\| \| 6/769 [00:51<1:28:32, 6.96s/it] {'loss': 1.1557, 'grad_norm': 4.09375, 'learning_rate': 1.3157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1894.62, 'epoch': 0.01}
	1%\| \| 6/769 [00:51<1:28:32, 6.96s/it] 1%\| \| 7/769 [00:57<1:20:49, 6.36s/it] {'loss': 1.1819, 'grad_norm': 5.21875, 'learning_rate': 1.5789473684210526e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.51, 'tokens_per_second_per_gpu': 1854.54, 'epoch': 0.01}
	1%\| \| 7/769 [00:57<1:20:49, 6.36s/it] 1%\| \| 8/769 [01:02<1:18:07, 6.16s/it] {'loss': 1.1807, 'grad_norm': 4.125, 'learning_rate': 1.8421052631578948e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1751.39, 'epoch': 0.01}
	1%\| \| 8/769 [01:02<1:18:07, 6.16s/it] 1%\| \| 9/769 [01:08<1:16:07, 6.01s/it] {'loss': 1.1324, 'grad_norm': 4.40625, 'learning_rate': 2.105263157894737e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1412.84, 'epoch': 0.01}
	1%\| \| 9/769 [01:08<1:16:07, 6.01s/it] 1%\|▏ \| 10/769 [01:13<1:12:34, 5.74s/it] {'loss': 1.1556, 'grad_norm': 3.78125, 'learning_rate': 2.368421052631579e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1849.28, 'epoch': 0.01}
	1%\|▏ \| 10/769 [01:13<1:12:34, 5.74s/it] 1%\|▏ \| 11/769 [01:19<1:12:17, 5.72s/it] {'loss': 1.1855, 'grad_norm': 3.859375, 'learning_rate': 2.631578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1624.94, 'epoch': 0.01}
	1%\|▏ \| 11/769 [01:19<1:12:17, 5.72s/it] 2%\|▏ \| 12/769 [01:24<1:12:03, 5.71s/it] {'loss': 1.1024, 'grad_norm': 4.09375, 'learning_rate': 2.8947368421052634e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1463.62, 'epoch': 0.02}
	2%\|▏ \| 12/769 [01:25<1:12:03, 5.71s/it] 2%\|▏ \| 13/769 [01:29<1:08:10, 5.41s/it] {'loss': 1.1591, 'grad_norm': 3.765625, 'learning_rate': 3.157894736842105e-06, 'memory/max_active (GiB)': 41.5, 'memory/max_allocated (GiB)': 41.5, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1910.89, 'epoch': 0.02}
	2%\|▏ \| 13/769 [01:29<1:08:10, 5.41s/it] 2%\|▏ \| 14/769 [01:35<1:09:08, 5.49s/it] {'loss': 1.0844, 'grad_norm': 3.484375, 'learning_rate': 3.421052631578948e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1588.42, 'epoch': 0.02}
	2%\|▏ \| 14/769 [01:35<1:09:08, 5.49s/it] 2%\|▏ \| 15/769 [01:39<1:05:28, 5.21s/it] {'loss': 1.2444, 'grad_norm': 3.984375, 'learning_rate': 3.6842105263157896e-06, 'memory/max_active (GiB)': 41.46, 'memory/max_allocated (GiB)': 41.46, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1568.51, 'epoch': 0.02}
	2%\|▏ \| 15/769 [01:39<1:05:28, 5.21s/it] 2%\|▏ \| 16/769 [01:45<1:07:09, 5.35s/it] {'loss': 1.1609, 'grad_norm': 3.578125, 'learning_rate': 3.947368421052632e-06, 'memory/max_active (GiB)': 46.74, 'memory/max_allocated (GiB)': 46.74, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1338.16, 'epoch': 0.02}
	2%\|▏ \| 16/769 [01:45<1:07:09, 5.35s/it] 2%\|▏ \| 17/769 [01:51<1:08:21, 5.45s/it] {'loss': 1.099, 'grad_norm': 3.125, 'learning_rate': 4.210526315789474e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1704.75, 'epoch': 0.02}
	2%\|▏ \| 17/769 [01:51<1:08:21, 5.45s/it] 2%\|▏ \| 18/769 [01:57<1:09:09, 5.53s/it] {'loss': 1.0828, 'grad_norm': 3.046875, 'learning_rate': 4.473684210526316e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1664.81, 'epoch': 0.02}
	2%\|▏ \| 18/769 [01:57<1:09:09, 5.53s/it] 2%\|▏ \| 19/769 [02:02<1:07:39, 5.41s/it] {'loss': 1.2319, 'grad_norm': 3.4375, 'learning_rate': 4.736842105263158e-06, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1602.68, 'epoch': 0.02}
	2%\|▏ \| 19/769 [02:02<1:07:39, 5.41s/it] 3%\|▎ \| 20/769 [02:07<1:08:44, 5.51s/it] {'loss': 1.0638, 'grad_norm': 2.953125, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.13, 'tokens_per_second_per_gpu': 1755.41, 'epoch': 0.03}
	3%\|▎ \| 20/769 [02:07<1:08:44, 5.51s/it] 3%\|▎ \| 21/769 [02:13<1:09:22, 5.56s/it] {'loss': 1.2072, 'grad_norm': 2.859375, 'learning_rate': 5.263157894736842e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1833.74, 'epoch': 0.03}
	3%\|▎ \| 21/769 [02:13<1:09:22, 5.56s/it] 3%\|▎ \| 22/769 [02:18<1:07:37, 5.43s/it] {'loss': 1.2006, 'grad_norm': 3.109375, 'learning_rate': 5.526315789473685e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1567.66, 'epoch': 0.03}
	3%\|▎ \| 22/769 [02:18<1:07:37, 5.43s/it] 3%\|▎ \| 23/769 [02:25<1:12:34, 5.84s/it] {'loss': 1.0081, 'grad_norm': 2.625, 'learning_rate': 5.789473684210527e-06, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1412.25, 'epoch': 0.03}
	3%\|▎ \| 23/769 [02:25<1:12:34, 5.84s/it] 3%\|▎ \| 24/769 [02:31<1:12:04, 5.80s/it] {'loss': 1.1437, 'grad_norm': 2.6875, 'learning_rate': 6.0526315789473685e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.26, 'tokens_per_second_per_gpu': 1749.27, 'epoch': 0.03}
	3%\|▎ \| 24/769 [02:31<1:12:04, 5.80s/it] 3%\|▎ \| 25/769 [02:36<1:11:33, 5.77s/it] {'loss': 1.0987, 'grad_norm': 2.765625, 'learning_rate': 6.31578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1627.64, 'epoch': 0.03}
	3%\|▎ \| 25/769 [02:36<1:11:33, 5.77s/it] 3%\|▎ \| 26/769 [02:42<1:09:06, 5.58s/it] {'loss': 1.1046, 'grad_norm': 2.6875, 'learning_rate': 6.578947368421054e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1967.95, 'epoch': 0.03}
	3%\|▎ \| 26/769 [02:42<1:09:06, 5.58s/it] 4%\|▎ \| 27/769 [02:47<1:09:27, 5.62s/it] {'loss': 1.1072, 'grad_norm': 2.8125, 'learning_rate': 6.842105263157896e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1594.62, 'epoch': 0.04}
	4%\|▎ \| 27/769 [02:47<1:09:27, 5.62s/it] 4%\|▎ \| 28/769 [02:52<1:05:29, 5.30s/it] {'loss': 1.1974, 'grad_norm': 3.03125, 'learning_rate': 7.1052631578947375e-06, 'memory/max_active (GiB)': 41.47, 'memory/max_allocated (GiB)': 41.47, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1766.49, 'epoch': 0.04}
	4%\|▎ \| 28/769 [02:52<1:05:29, 5.30s/it] 4%\|▍ \| 29/769 [02:58<1:06:52, 5.42s/it] {'loss': 1.2164, 'grad_norm': 2.734375, 'learning_rate': 7.368421052631579e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1593.41, 'epoch': 0.04}
	4%\|▍ \| 29/769 [02:58<1:06:52, 5.42s/it] 4%\|▍ \| 30/769 [03:03<1:07:46, 5.50s/it] {'loss': 1.1324, 'grad_norm': 2.65625, 'learning_rate': 7.631578947368423e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1607.66, 'epoch': 0.04}
	4%\|▍ \| 30/769 [03:03<1:07:46, 5.50s/it] 4%\|▍ \| 31/769 [03:09<1:08:21, 5.56s/it] {'loss': 1.0693, 'grad_norm': 2.96875, 'learning_rate': 7.894736842105265e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1454.28, 'epoch': 0.04}
	4%\|▍ \| 31/769 [03:09<1:08:21, 5.56s/it] 4%\|▍ \| 32/769 [03:14<1:06:43, 5.43s/it] {'loss': 1.0988, 'grad_norm': 2.65625, 'learning_rate': 8.157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1972.66, 'epoch': 0.04}
	4%\|▍ \| 32/769 [03:14<1:06:43, 5.43s/it] 4%\|▍ \| 33/769 [03:19<1:03:27, 5.17s/it] {'loss': 1.074, 'grad_norm': 2.84375, 'learning_rate': 8.421052631578948e-06, 'memory/max_active (GiB)': 41.48, 'memory/max_allocated (GiB)': 41.48, 'memory/device_reserved (GiB)': 77.13, 'tokens_per_second_per_gpu': 1705.8, 'epoch': 0.04}
	4%\|▍ \| 33/769 [03:19<1:03:27, 5.17s/it] 4%\|▍ \| 34/769 [03:24<1:03:14, 5.16s/it] {'loss': 1.1673, 'grad_norm': 2.671875, 'learning_rate': 8.68421052631579e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 76.88, 'tokens_per_second_per_gpu': 1761.02, 'epoch': 0.04}
	4%\|▍ \| 34/769 [03:24<1:03:14, 5.16s/it] 5%\|▍ \| 35/769 [03:29<1:02:59, 5.15s/it] {'loss': 1.1655, 'grad_norm': 3.03125, 'learning_rate': 8.947368421052632e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 76.01, 'tokens_per_second_per_gpu': 1479.42, 'epoch': 0.05}
	5%\|▍ \| 35/769 [03:29<1:02:59, 5.15s/it] 5%\|▍ \| 36/769 [03:35<1:04:53, 5.31s/it] {'loss': 1.1483, 'grad_norm': 2.71875, 'learning_rate': 9.210526315789474e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1466.59, 'epoch': 0.05}
	5%\|▍ \| 36/769 [03:35<1:04:53, 5.31s/it] 5%\|▍ \| 37/769 [03:40<1:04:09, 5.26s/it] {'loss': 1.159, 'grad_norm': 2.578125, 'learning_rate': 9.473684210526315e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1806.68, 'epoch': 0.05}
	5%\|▍ \| 37/769 [03:40<1:04:09, 5.26s/it] 5%\|▍ \| 38/769 [03:45<1:05:41, 5.39s/it] {'loss': 1.1035, 'grad_norm': 2.625, 'learning_rate': 9.736842105263159e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1689.12, 'epoch': 0.05}
	5%\|▍ \| 38/769 [03:45<1:05:41, 5.39s/it] 5%\|▌ \| 39/769 [03:51<1:06:41, 5.48s/it] {'loss': 1.0471, 'grad_norm': 2.609375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1579.8, 'epoch': 0.05}
	5%\|▌ \| 39/769 [03:51<1:06:41, 5.48s/it] 5%\|▌ \| 40/769 [03:56<1:05:20, 5.38s/it] {'loss': 1.2091, 'grad_norm': 2.96875, 'learning_rate': 1.0263157894736844e-05, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1542.1, 'epoch': 0.05}
	5%\|▌ \| 40/769 [03:56<1:05:20, 5.38s/it] 5%\|▌ \| 41/769 [04:02<1:06:25, 5.47s/it] {'loss': 1.0722, 'grad_norm': 2.59375, 'learning_rate': 1.0526315789473684e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1585.53, 'epoch': 0.05}
	5%\|▌ \| 41/769 [04:02<1:06:25, 5.47s/it]