| [2025-11-06 16:11:41,516] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:837642] baseline 0.000GB () | |
| [2025-11-06 16:11:41,516] [INFO] [axolotl.cli.config.load_cfg:248] [PID:837642] config: | |
| { | |
| "activation_offloading": false, | |
| "axolotl_config_path": "3b-qat-nvfp4.yaml", | |
| "base_model": "meta-llama/Llama-3.2-3B", | |
| "base_model_config": "meta-llama/Llama-3.2-3B", | |
| "batch_size": 64, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_90", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "context_parallel_size": 1, | |
| "cosine_constant_lr_ratio": 0.0, | |
| "cosine_min_lr_ratio": 1.0, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_prepared_path": "./outputs/dataset_prepared", | |
| "dataset_processes": 128, | |
| "datasets": [ | |
| { | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "yahma/alpaca-cleaned", | |
| "split": "train[:95%]", | |
| "trust_remote_code": false, | |
| "type": "alpaca" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.8.0" | |
| }, | |
| "eval_batch_size": 64, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_table_size": 0, | |
| "evals_per_epoch": 1, | |
| "experimental_skip_move_to_device": true, | |
| "flash_attention": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 1, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": true | |
| }, | |
| "hub_model_id": "AlexHung29629/3b-qat-nvfp4", | |
| "include_tkps": true, | |
| "is_llama_derived_model": true, | |
| "learning_rate": 2e-05, | |
| "liger_fused_linear_cross_entropy": true, | |
| "liger_glu_activation": true, | |
| "liger_layer_norm": true, | |
| "liger_rms_norm": true, | |
| "liger_rope": true, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "logging_steps": 1, | |
| "lora_dropout": 0.0, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "cosine", | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 64, | |
| "model_config_type": "llama", | |
| "num_epochs": 1.0, | |
| "optimizer": "adamw_torch_fused", | |
| "output_dir": "./outputs/qat_out/", | |
| "plugins": [ | |
| "axolotl.integrations.liger.LigerPlugin" | |
| ], | |
| "pretrain_multipack_attn": true, | |
| "profiler_steps_start": 0, | |
| "qat": { | |
| "activation_dtype": "TorchAOQuantDType.nvfp4", | |
| "group_size": 16, | |
| "quantize_embedding": false, | |
| "weight_dtype": "TorchAOQuantDType.nvfp4" | |
| }, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_first_step": true, | |
| "save_only_model": true, | |
| "save_safetensors": true, | |
| "saves_per_epoch": 1, | |
| "sequence_len": 8192, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "special_tokens": { | |
| "pad_token": "<|finetune_right_pad_id|>" | |
| }, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "meta-llama/Llama-3.2-3B", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "use_ray": false, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "warmup_ratio": 0.1, | |
| "weight_decay": 0.0, | |
| "world_size": 1 | |
| } | |
| [2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|> | |
| [2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|> | |
| [2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|> | |
| [2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None | |
| [2025-11-06 16:11:46,489] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference. | |
| [2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:837642] Unable to find prepared dataset in outputs/dataset_prepared/9bc662aed65b76546b2d635b3957a343 | |
| [2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:837642] Loading raw datasets... | |
| [2025-11-06 16:11:46,492] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:837642] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. | |
| Generating train split: 0%| | 0/51760 [00:00<?, ? examples/s] Generating train split: 100%|ββββββββββ| 51760/51760 [00:00<00:00, 229845.07 examples/s] Generating train split: 100%|ββββββββββ| 51760/51760 [00:00<00:00, 228708.96 examples/s] | |
| [2025-11-06 16:12:08,982] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:837642] Loading dataset: yahma/alpaca-cleaned with base_type: alpaca and prompt_style: None | |
| Tokenizing Prompts (num_proc=128): 0%| | 0/49172 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=128): 0%| | 84/49172 [00:01<13:38, 59.94 examples/s] Tokenizing Prompts (num_proc=128): 0%| | 176/49172 [00:01<06:01, 135.38 examples/s] Tokenizing Prompts (num_proc=128): 2%|β | 792/49172 [00:01<01:05, 743.93 examples/s] Tokenizing Prompts (num_proc=128): 3%|β | 1289/49172 [00:01<00:39, 1224.84 examples/s] Tokenizing Prompts (num_proc=128): 3%|β | 1663/49172 [00:01<00:30, 1550.01 examples/s] Tokenizing Prompts (num_proc=128): 4%|β | 2053/49172 [00:02<00:25, 1838.37 examples/s] Tokenizing Prompts (num_proc=128): 5%|β | 2462/49172 [00:02<00:22, 2120.82 examples/s] Tokenizing Prompts (num_proc=128): 6%|β | 2931/49172 [00:02<00:17, 2593.78 examples/s] Tokenizing Prompts (num_proc=128): 7%|β | 3316/49172 [00:02<00:17, 2658.78 examples/s] Tokenizing Prompts (num_proc=128): 8%|β | 3704/49172 [00:02<00:16, 2738.71 examples/s] Tokenizing Prompts (num_proc=128): 8%|β | 4072/49172 [00:02<00:16, 2765.32 examples/s] Tokenizing Prompts (num_proc=128): 9%|β | 4392/49172 [00:02<00:16, 2672.66 examples/s] Tokenizing Prompts (num_proc=128): 10%|β | 4715/49172 [00:03<00:17, 2538.79 examples/s] Tokenizing Prompts (num_proc=128): 10%|β | 5094/49172 [00:03<00:17, 2586.15 examples/s] Tokenizing Prompts (num_proc=128): 11%|β | 5504/49172 [00:03<00:16, 2692.82 examples/s] Tokenizing Prompts (num_proc=128): 12%|ββ | 5910/49172 [00:03<00:15, 2763.20 examples/s] Tokenizing Prompts (num_proc=128): 13%|ββ | 6311/49172 [00:03<00:14, 2975.07 examples/s] Tokenizing Prompts (num_proc=128): 14%|ββ | 6668/49172 [00:03<00:14, 2859.49 examples/s] Tokenizing Prompts (num_proc=128): 14%|ββ | 7058/49172 [00:03<00:14, 2844.49 examples/s] Tokenizing Prompts (num_proc=128): 15%|ββ | 7385/49172 [00:03<00:15, 2728.62 examples/s] Tokenizing Prompts (num_proc=128): 16%|ββ | 7836/49172 [00:04<00:13, 3015.74 examples/s] Tokenizing Prompts (num_proc=128): 17%|ββ | 8193/49172 [00:04<00:14, 2873.13 examples/s] Tokenizing Prompts (num_proc=128): 17%|ββ | 8537/49172 [00:04<00:14, 2768.24 examples/s] Tokenizing Prompts (num_proc=128): 18%|ββ | 8959/49172 [00:04<00:13, 2885.47 examples/s] Tokenizing Prompts (num_proc=128): 19%|ββ | 9339/49172 [00:04<00:14, 2805.07 examples/s] Tokenizing Prompts (num_proc=128): 20%|ββ | 9755/49172 [00:04<00:13, 2846.53 examples/s] Tokenizing Prompts (num_proc=128): 21%|ββ | 10145/49172 [00:04<00:13, 2839.00 examples/s] Tokenizing Prompts (num_proc=128): 22%|βββ | 10628/49172 [00:05<00:12, 3040.97 examples/s] Tokenizing Prompts (num_proc=128): 22%|βββ | 11039/49172 [00:05<00:11, 3253.40 examples/s] Tokenizing Prompts (num_proc=128): 23%|βββ | 11385/49172 [00:05<00:12, 2982.44 examples/s] Tokenizing Prompts (num_proc=128): 24%|βββ | 11692/49172 [00:05<00:13, 2756.63 examples/s] Tokenizing Prompts (num_proc=128): 25%|βββ | 12063/49172 [00:05<00:13, 2720.60 examples/s] Tokenizing Prompts (num_proc=128): 25%|βββ | 12463/49172 [00:05<00:13, 2810.95 examples/s] Tokenizing Prompts (num_proc=128): 26%|βββ | 12850/49172 [00:05<00:12, 2819.15 examples/s] Tokenizing Prompts (num_proc=128): 27%|βββ | 13292/49172 [00:06<00:12, 2922.32 examples/s] Tokenizing Prompts (num_proc=128): 28%|βββ | 13600/49172 [00:06<00:13, 2700.65 examples/s] Tokenizing Prompts (num_proc=128): 28%|βββ | 13989/49172 [00:06<00:12, 2716.86 examples/s] Tokenizing Prompts (num_proc=128): 29%|βββ | 14475/49172 [00:06<00:12, 2852.98 examples/s] Tokenizing Prompts (num_proc=128): 30%|βββ | 14914/49172 [00:06<00:11, 2938.70 examples/s] Tokenizing Prompts (num_proc=128): 31%|ββββ | 15412/49172 [00:06<00:10, 3223.11 examples/s] Tokenizing Prompts (num_proc=128): 32%|ββββ | 15748/49172 [00:06<00:11, 3025.74 examples/s] Tokenizing Prompts (num_proc=128): 33%|ββββ | 16102/49172 [00:06<00:11, 2858.09 examples/s] Tokenizing Prompts (num_proc=128): 33%|ββββ | 16438/49172 [00:07<00:12, 2720.40 examples/s] Tokenizing Prompts (num_proc=128): 34%|ββββ | 16767/49172 [00:07<00:12, 2595.58 examples/s] Tokenizing Prompts (num_proc=128): 35%|ββββ | 17149/49172 [00:07<00:12, 2665.96 examples/s] Tokenizing Prompts (num_proc=128): 36%|ββββ | 17516/49172 [00:07<00:11, 2643.92 examples/s] Tokenizing Prompts (num_proc=128): 37%|ββββ | 17961/49172 [00:07<00:10, 2849.24 examples/s] Tokenizing Prompts (num_proc=128): 37%|ββββ | 18279/49172 [00:07<00:10, 2822.57 examples/s] Tokenizing Prompts (num_proc=128): 38%|ββββ | 18574/49172 [00:07<00:11, 2591.21 examples/s] Tokenizing Prompts (num_proc=128): 39%|ββββ | 18957/49172 [00:08<00:11, 2625.56 examples/s] Tokenizing Prompts (num_proc=128): 40%|ββββ | 19438/49172 [00:08<00:10, 2894.61 examples/s] Tokenizing Prompts (num_proc=128): 40%|ββββ | 19776/49172 [00:08<00:10, 2741.27 examples/s] Tokenizing Prompts (num_proc=128): 41%|ββββ | 20133/49172 [00:08<00:10, 2676.71 examples/s] Tokenizing Prompts (num_proc=128): 42%|βββββ | 20582/49172 [00:08<00:10, 2813.67 examples/s] Tokenizing Prompts (num_proc=128): 43%|βββββ | 21056/49172 [00:08<00:09, 2991.88 examples/s] Tokenizing Prompts (num_proc=128): 44%|βββββ | 21507/49172 [00:08<00:08, 3120.60 examples/s] Tokenizing Prompts (num_proc=128): 44%|βββββ | 21844/49172 [00:09<00:09, 2927.37 examples/s] Tokenizing Prompts (num_proc=128): 45%|βββββ | 22214/49172 [00:09<00:09, 2980.95 examples/s] Tokenizing Prompts (num_proc=128): 46%|βββββ | 22524/49172 [00:09<00:09, 2740.32 examples/s] Tokenizing Prompts (num_proc=128): 46%|βββββ | 22828/49172 [00:09<00:09, 2639.81 examples/s] Tokenizing Prompts (num_proc=128): 47%|βββββ | 23146/49172 [00:09<00:10, 2510.35 examples/s] Tokenizing Prompts (num_proc=128): 48%|βββββ | 23619/49172 [00:09<00:09, 2763.04 examples/s] Tokenizing Prompts (num_proc=128): 49%|βββββ | 24069/49172 [00:09<00:08, 2964.79 examples/s] Tokenizing Prompts (num_proc=128): 50%|βββββ | 24455/49172 [00:09<00:08, 2924.79 examples/s] Tokenizing Prompts (num_proc=128): 50%|βββββ | 24809/49172 [00:10<00:08, 2856.24 examples/s] Tokenizing Prompts (num_proc=128): 51%|βββββ | 25112/49172 [00:10<00:09, 2621.31 examples/s] Tokenizing Prompts (num_proc=128): 52%|ββββββ | 25522/49172 [00:10<00:08, 2714.48 examples/s] Tokenizing Prompts (num_proc=128): 53%|ββββββ | 25873/49172 [00:10<00:08, 2705.01 examples/s] Tokenizing Prompts (num_proc=128): 54%|ββββββ | 26322/49172 [00:10<00:07, 2907.63 examples/s] Tokenizing Prompts (num_proc=128): 54%|ββββββ | 26671/49172 [00:10<00:08, 2787.83 examples/s] Tokenizing Prompts (num_proc=128): 55%|ββββββ | 27127/49172 [00:10<00:07, 2923.69 examples/s] Tokenizing Prompts (num_proc=128): 56%|ββββββ | 27506/49172 [00:11<00:07, 2929.98 examples/s] Tokenizing Prompts (num_proc=128): 57%|ββββββ | 27928/49172 [00:11<00:07, 2972.71 examples/s] Tokenizing Prompts (num_proc=128): 58%|ββββββ | 28285/49172 [00:11<00:06, 3017.41 examples/s] Tokenizing Prompts (num_proc=128): 58%|ββββββ | 28589/49172 [00:11<00:07, 2769.13 examples/s] Tokenizing Prompts (num_proc=128): 59%|ββββββ | 28960/49172 [00:11<00:07, 2746.25 examples/s] Tokenizing Prompts (num_proc=128): 60%|ββββββ | 29343/49172 [00:11<00:07, 2735.06 examples/s] Tokenizing Prompts (num_proc=128): 60%|ββββββ | 29730/49172 [00:11<00:07, 2777.41 examples/s] Tokenizing Prompts (num_proc=128): 61%|βββββββ | 30201/49172 [00:11<00:06, 3027.86 examples/s] Tokenizing Prompts (num_proc=128): 62%|βββββββ | 30511/49172 [00:12<00:06, 2794.02 examples/s] Tokenizing Prompts (num_proc=128): 63%|βββββββ | 30963/49172 [00:12<00:06, 2860.65 examples/s] Tokenizing Prompts (num_proc=128): 64%|βββββββ | 31260/49172 [00:12<00:06, 2659.00 examples/s] Tokenizing Prompts (num_proc=128): 64%|βββββββ | 31661/49172 [00:12<00:06, 2744.06 examples/s] Tokenizing Prompts (num_proc=128): 65%|βββββββ | 32103/49172 [00:12<00:05, 2886.04 examples/s] Tokenizing Prompts (num_proc=128): 66%|βββββββ | 32523/49172 [00:12<00:05, 2982.93 examples/s] Tokenizing Prompts (num_proc=128): 67%|βββββββ | 32894/49172 [00:12<00:05, 2895.08 examples/s] Tokenizing Prompts (num_proc=128): 68%|βββββββ | 33277/49172 [00:13<00:05, 3037.84 examples/s] Tokenizing Prompts (num_proc=128): 68%|βββββββ | 33649/49172 [00:13<00:05, 2842.83 examples/s] Tokenizing Prompts (num_proc=128): 69%|βββββββ | 33971/49172 [00:13<00:05, 2690.27 examples/s] Tokenizing Prompts (num_proc=128): 70%|βββββββ | 34337/49172 [00:13<00:05, 2695.46 examples/s] Tokenizing Prompts (num_proc=128): 71%|βββββββ | 34730/49172 [00:13<00:05, 2719.08 examples/s] Tokenizing Prompts (num_proc=128): 72%|ββββββββ | 35193/49172 [00:13<00:04, 2910.87 examples/s] Tokenizing Prompts (num_proc=128): 72%|ββββββββ | 35583/49172 [00:13<00:04, 2876.12 examples/s] Tokenizing Prompts (num_proc=128): 73%|ββββββββ | 35965/49172 [00:14<00:04, 2847.46 examples/s] Tokenizing Prompts (num_proc=128): 74%|ββββββββ | 36341/49172 [00:14<00:04, 2829.40 examples/s] Tokenizing Prompts (num_proc=128): 75%|ββββββββ | 36647/49172 [00:14<00:04, 2673.80 examples/s] Tokenizing Prompts (num_proc=128): 75%|ββββββββ | 37103/49172 [00:14<00:04, 2839.03 examples/s] Tokenizing Prompts (num_proc=128): 76%|ββββββββ | 37568/49172 [00:14<00:03, 3009.77 examples/s] Tokenizing Prompts (num_proc=128): 77%|ββββββββ | 37894/49172 [00:14<00:03, 2833.19 examples/s] Tokenizing Prompts (num_proc=128): 78%|ββββββββ | 38273/49172 [00:14<00:03, 2797.05 examples/s] Tokenizing Prompts (num_proc=128): 79%|ββββββββ | 38625/49172 [00:15<00:03, 2713.00 examples/s] Tokenizing Prompts (num_proc=128): 80%|ββββββββ | 39134/49172 [00:15<00:03, 2989.24 examples/s] Tokenizing Prompts (num_proc=128): 80%|ββββββββ | 39484/49172 [00:15<00:03, 2970.93 examples/s] Tokenizing Prompts (num_proc=128): 81%|ββββββββ | 39830/49172 [00:15<00:03, 2822.53 examples/s] Tokenizing Prompts (num_proc=128): 82%|βββββββββ | 40145/49172 [00:15<00:03, 2691.61 examples/s] Tokenizing Prompts (num_proc=128): 83%|βββββββββ | 40611/49172 [00:15<00:02, 2919.81 examples/s] Tokenizing Prompts (num_proc=128): 83%|βββββββββ | 40933/49172 [00:15<00:02, 2872.22 examples/s] Tokenizing Prompts (num_proc=128): 84%|βββββββββ | 41235/49172 [00:15<00:03, 2550.35 examples/s] Tokenizing Prompts (num_proc=128): 85%|βββββββββ | 41636/49172 [00:16<00:02, 2623.37 examples/s] Tokenizing Prompts (num_proc=128): 85%|βββββββββ | 42011/49172 [00:16<00:02, 2667.31 examples/s] Tokenizing Prompts (num_proc=128): 87%|βββββββββ | 42565/49172 [00:16<00:02, 3151.00 examples/s] Tokenizing Prompts (num_proc=128): 87%|βββββββββ | 42957/49172 [00:16<00:02, 3026.85 examples/s] Tokenizing Prompts (num_proc=128): 88%|βββββββββ | 43325/49172 [00:16<00:02, 2828.92 examples/s] Tokenizing Prompts (num_proc=128): 89%|βββββββββ | 43700/49172 [00:16<00:01, 2829.86 examples/s] Tokenizing Prompts (num_proc=128): 90%|βββββββββ | 44022/49172 [00:16<00:01, 2649.38 examples/s] Tokenizing Prompts (num_proc=128): 90%|βββββββββ | 44422/49172 [00:17<00:01, 2713.96 examples/s] Tokenizing Prompts (num_proc=128): 91%|βββββββββ | 44782/49172 [00:17<00:01, 2703.83 examples/s] Tokenizing Prompts (num_proc=128): 92%|ββββββββββ| 45155/49172 [00:17<00:01, 2707.69 examples/s] Tokenizing Prompts (num_proc=128): 93%|ββββββββββ| 45628/49172 [00:17<00:01, 2900.02 examples/s] Tokenizing Prompts (num_proc=128): 94%|ββββββββββ| 45986/49172 [00:17<00:01, 2826.26 examples/s] Tokenizing Prompts (num_proc=128): 94%|ββββββββββ| 46333/49172 [00:17<00:01, 2738.12 examples/s] Tokenizing Prompts (num_proc=128): 95%|ββββββββββ| 46787/49172 [00:17<00:00, 3126.02 examples/s] Tokenizing Prompts (num_proc=128): 96%|ββββββββββ| 47266/49172 [00:17<00:00, 3503.55 examples/s] Tokenizing Prompts (num_proc=128): 97%|ββββββββββ| 47643/49172 [00:18<00:00, 3338.62 examples/s] Tokenizing Prompts (num_proc=128): 98%|ββββββββββ| 48031/49172 [00:18<00:00, 3013.54 examples/s] Tokenizing Prompts (num_proc=128): 99%|ββββββββββ| 48485/49172 [00:18<00:00, 3381.58 examples/s] Tokenizing Prompts (num_proc=128): 99%|ββββββββββ| 48913/49172 [00:18<00:00, 3410.78 examples/s] Tokenizing Prompts (num_proc=128): 100%|ββββββββββ| 49172/49172 [00:19<00:00, 2564.69 examples/s] | |
| [2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:837642] min_input_len: 33 | |
| [2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:837642] max_input_len: 1051 | |
| Dropping Long Sequences (>8192) (num_proc=128): 0%| | 0/49172 [00:00<?, ? examples/s] Dropping Long Sequences (>8192) (num_proc=128): 1%| | 385/49172 [00:00<01:43, 471.65 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 5%|β | 2695/49172 [00:00<00:12, 3750.47 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 13%|ββ | 6545/49172 [00:01<00:04, 9405.16 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 19%|ββ | 9236/49172 [00:01<00:03, 12478.76 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 25%|βββ | 12308/49172 [00:01<00:02, 15931.93 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 30%|βββ | 14996/49172 [00:01<00:01, 17930.73 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 37%|ββββ | 18068/49172 [00:01<00:01, 20677.88 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 44%|βββββ | 21524/49172 [00:01<00:01, 23483.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 50%|βββββ | 24596/49172 [00:01<00:01, 23335.07 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 60%|ββββββ | 29588/49172 [00:01<00:00, 29166.32 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 69%|βββββββ | 33812/49172 [00:01<00:00, 31889.27 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 76%|ββββββββ | 37268/49172 [00:02<00:00, 32125.51 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 83%|βββββββββ | 40724/49172 [00:02<00:00, 31182.70 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 93%|ββββββββββ| 45716/49172 [00:02<00:00, 33653.89 examples/s] Dropping Long Sequences (>8192) (num_proc=128): 100%|ββββββββββ| 49172/49172 [00:02<00:00, 18442.91 examples/s] | |
| Saving the dataset (0/128 shards): 0%| | 0/49172 [00:00<?, ? examples/s] Saving the dataset (0/128 shards): 1%| | 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (1/128 shards): 1%| | 385/49172 [00:02<04:27, 182.62 examples/s] Saving the dataset (2/128 shards): 2%|β | 770/49172 [00:02<04:25, 182.62 examples/s] Saving the dataset (3/128 shards): 2%|β | 1155/49172 [00:02<04:22, 182.62 examples/s] Saving the dataset (4/128 shards): 3%|β | 1540/49172 [00:02<04:20, 182.62 examples/s] Saving the dataset (5/128 shards): 4%|β | 1925/49172 [00:02<04:18, 182.62 examples/s] Saving the dataset (6/128 shards): 5%|β | 2310/49172 [00:02<04:16, 182.62 examples/s] Saving the dataset (7/128 shards): 5%|β | 2695/49172 [00:02<04:14, 182.62 examples/s] Saving the dataset (8/128 shards): 6%|β | 3080/49172 [00:02<04:12, 182.62 examples/s] Saving the dataset (9/128 shards): 7%|β | 3465/49172 [00:02<04:10, 182.62 examples/s] Saving the dataset (10/128 shards): 8%|β | 3850/49172 [00:02<04:08, 182.62 examples/s] Saving the dataset (11/128 shards): 9%|β | 4235/49172 [00:02<04:06, 182.62 examples/s] Saving the dataset (12/128 shards): 9%|β | 4620/49172 [00:02<04:03, 182.62 examples/s] Saving the dataset (13/128 shards): 10%|β | 5005/49172 [00:02<04:01, 182.62 examples/s] Saving the dataset (14/128 shards): 11%|β | 5390/49172 [00:02<03:59, 182.62 examples/s] Saving the dataset (15/128 shards): 12%|ββ | 5775/49172 [00:02<03:57, 182.62 examples/s] Saving the dataset (16/128 shards): 13%|ββ | 6160/49172 [00:02<03:55, 182.62 examples/s] Saving the dataset (17/128 shards): 13%|ββ | 6545/49172 [00:02<03:53, 182.62 examples/s] Saving the dataset (18/128 shards): 14%|ββ | 6930/49172 [00:02<03:51, 182.62 examples/s] Saving the dataset (19/128 shards): 15%|ββ | 7315/49172 [00:02<03:49, 182.62 examples/s] Saving the dataset (20/128 shards): 16%|ββ | 7700/49172 [00:02<03:47, 182.62 examples/s] Saving the dataset (21/128 shards): 16%|ββ | 8084/49172 [00:02<03:44, 182.62 examples/s] Saving the dataset (22/128 shards): 17%|ββ | 8468/49172 [00:02<03:42, 182.62 examples/s] Saving the dataset (23/128 shards): 18%|ββ | 8852/49172 [00:02<03:40, 182.62 examples/s] Saving the dataset (24/128 shards): 19%|ββ | 9236/49172 [00:02<03:38, 182.62 examples/s] Saving the dataset (25/128 shards): 20%|ββ | 9620/49172 [00:02<03:36, 182.62 examples/s] Saving the dataset (26/128 shards): 20%|ββ | 10004/49172 [00:02<03:34, 182.62 examples/s] Saving the dataset (27/128 shards): 21%|ββ | 10388/49172 [00:02<03:32, 182.62 examples/s] Saving the dataset (28/128 shards): 22%|βββ | 10772/49172 [00:02<03:30, 182.62 examples/s] Saving the dataset (29/128 shards): 23%|βββ | 11156/49172 [00:02<03:28, 182.62 examples/s] Saving the dataset (30/128 shards): 23%|βββ | 11540/49172 [00:02<03:26, 182.62 examples/s] Saving the dataset (31/128 shards): 24%|βββ | 11924/49172 [00:02<03:23, 182.62 examples/s] Saving the dataset (32/128 shards): 25%|βββ | 12308/49172 [00:02<03:21, 182.62 examples/s] Saving the dataset (33/128 shards): 26%|βββ | 12692/49172 [00:02<03:19, 182.62 examples/s] Saving the dataset (34/128 shards): 27%|βββ | 13076/49172 [00:02<03:17, 182.62 examples/s] Saving the dataset (35/128 shards): 27%|βββ | 13460/49172 [00:02<03:15, 182.62 examples/s] Saving the dataset (36/128 shards): 28%|βββ | 13844/49172 [00:02<03:13, 182.62 examples/s] Saving the dataset (37/128 shards): 29%|βββ | 14228/49172 [00:02<03:11, 182.62 examples/s] Saving the dataset (38/128 shards): 30%|βββ | 14612/49172 [00:02<03:09, 182.62 examples/s] Saving the dataset (39/128 shards): 30%|βββ | 14996/49172 [00:02<03:07, 182.62 examples/s] Saving the dataset (40/128 shards): 31%|ββββ | 15380/49172 [00:02<03:05, 182.62 examples/s] Saving the dataset (41/128 shards): 32%|ββββ | 15764/49172 [00:02<03:02, 182.62 examples/s] Saving the dataset (42/128 shards): 33%|ββββ | 16148/49172 [00:02<03:00, 182.62 examples/s] Saving the dataset (43/128 shards): 34%|ββββ | 16532/49172 [00:02<02:58, 182.62 examples/s] Saving the dataset (44/128 shards): 34%|ββββ | 16916/49172 [00:02<02:56, 182.62 examples/s] Saving the dataset (45/128 shards): 35%|ββββ | 17300/49172 [00:02<02:54, 182.62 examples/s] Saving the dataset (46/128 shards): 36%|ββββ | 17684/49172 [00:02<02:52, 182.62 examples/s] Saving the dataset (47/128 shards): 37%|ββββ | 18068/49172 [00:02<02:50, 182.62 examples/s] Saving the dataset (48/128 shards): 38%|ββββ | 18452/49172 [00:02<02:48, 182.62 examples/s] Saving the dataset (49/128 shards): 38%|ββββ | 18836/49172 [00:02<02:46, 182.62 examples/s] Saving the dataset (50/128 shards): 39%|ββββ | 19220/49172 [00:02<02:44, 182.62 examples/s] Saving the dataset (51/128 shards): 40%|ββββ | 19604/49172 [00:02<02:41, 182.62 examples/s] Saving the dataset (52/128 shards): 41%|ββββ | 19988/49172 [00:02<02:39, 182.62 examples/s] Saving the dataset (53/128 shards): 41%|βββββ | 20372/49172 [00:02<02:37, 182.62 examples/s] Saving the dataset (54/128 shards): 42%|βββββ | 20756/49172 [00:02<02:35, 182.62 examples/s] Saving the dataset (55/128 shards): 43%|βββββ | 21140/49172 [00:02<02:33, 182.62 examples/s] Saving the dataset (56/128 shards): 44%|βββββ | 21524/49172 [00:02<02:31, 182.62 examples/s] Saving the dataset (57/128 shards): 45%|βββββ | 21908/49172 [00:02<02:29, 182.62 examples/s] Saving the dataset (58/128 shards): 45%|βββββ | 22292/49172 [00:02<02:27, 182.62 examples/s] Saving the dataset (59/128 shards): 46%|βββββ | 22676/49172 [00:02<02:25, 182.62 examples/s] Saving the dataset (60/128 shards): 47%|βββββ | 23060/49172 [00:02<02:22, 182.62 examples/s] Saving the dataset (61/128 shards): 48%|βββββ | 23444/49172 [00:02<02:20, 182.62 examples/s] Saving the dataset (62/128 shards): 48%|βββββ | 23828/49172 [00:02<02:18, 182.62 examples/s] Saving the dataset (63/128 shards): 49%|βββββ | 24212/49172 [00:02<02:16, 182.62 examples/s] Saving the dataset (64/128 shards): 50%|βββββ | 24596/49172 [00:02<02:14, 182.62 examples/s] Saving the dataset (65/128 shards): 51%|βββββ | 24980/49172 [00:02<02:12, 182.62 examples/s] Saving the dataset (66/128 shards): 52%|ββββββ | 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (67/128 shards): 52%|ββββββ | 25748/49172 [00:02<02:08, 182.62 examples/s] Saving the dataset (68/128 shards): 53%|ββββββ | 26132/49172 [00:02<02:06, 182.62 examples/s] Saving the dataset (69/128 shards): 54%|ββββββ | 26516/49172 [00:02<02:04, 182.62 examples/s] Saving the dataset (70/128 shards): 55%|ββββββ | 26900/49172 [00:02<02:01, 182.62 examples/s] Saving the dataset (71/128 shards): 55%|ββββββ | 27284/49172 [00:02<01:59, 182.62 examples/s] Saving the dataset (72/128 shards): 56%|ββββββ | 27668/49172 [00:02<01:57, 182.62 examples/s] Saving the dataset (73/128 shards): 57%|ββββββ | 28052/49172 [00:02<01:55, 182.62 examples/s] Saving the dataset (74/128 shards): 58%|ββββββ | 28436/49172 [00:02<01:53, 182.62 examples/s] Saving the dataset (75/128 shards): 59%|ββββββ | 28820/49172 [00:02<01:51, 182.62 examples/s] Saving the dataset (76/128 shards): 59%|ββββββ | 29204/49172 [00:02<01:49, 182.62 examples/s] Saving the dataset (77/128 shards): 60%|ββββββ | 29588/49172 [00:02<01:47, 182.62 examples/s] Saving the dataset (78/128 shards): 61%|ββββββ | 29972/49172 [00:02<01:45, 182.62 examples/s] Saving the dataset (79/128 shards): 62%|βββββββ | 30356/49172 [00:02<01:43, 182.62 examples/s] Saving the dataset (80/128 shards): 63%|βββββββ | 30740/49172 [00:02<01:40, 182.62 examples/s] Saving the dataset (81/128 shards): 63%|βββββββ | 31124/49172 [00:02<01:38, 182.62 examples/s] Saving the dataset (82/128 shards): 64%|βββββββ | 31508/49172 [00:02<01:36, 182.62 examples/s] Saving the dataset (83/128 shards): 65%|βββββββ | 31892/49172 [00:02<01:34, 182.62 examples/s] Saving the dataset (84/128 shards): 66%|βββββββ | 32276/49172 [00:02<01:32, 182.62 examples/s] Saving the dataset (85/128 shards): 66%|βββββββ | 32660/49172 [00:02<01:30, 182.62 examples/s] Saving the dataset (86/128 shards): 67%|βββββββ | 33044/49172 [00:02<01:28, 182.62 examples/s] Saving the dataset (87/128 shards): 68%|βββββββ | 33428/49172 [00:02<01:26, 182.62 examples/s] Saving the dataset (88/128 shards): 69%|βββββββ | 33812/49172 [00:02<01:24, 182.62 examples/s] Saving the dataset (89/128 shards): 70%|βββββββ | 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (90/128 shards): 70%|βββββββ | 34580/49172 [00:02<01:19, 182.62 examples/s] Saving the dataset (91/128 shards): 71%|βββββββ | 34964/49172 [00:02<01:17, 182.62 examples/s] Saving the dataset (92/128 shards): 72%|ββββββββ | 35348/49172 [00:02<01:15, 182.62 examples/s] Saving the dataset (93/128 shards): 73%|ββββββββ | 35732/49172 [00:02<01:13, 182.62 examples/s] Saving the dataset (94/128 shards): 73%|ββββββββ | 36116/49172 [00:02<01:11, 182.62 examples/s] Saving the dataset (95/128 shards): 74%|ββββββββ | 36500/49172 [00:02<01:09, 182.62 examples/s] Saving the dataset (96/128 shards): 75%|ββββββββ | 36884/49172 [00:02<01:07, 182.62 examples/s] Saving the dataset (97/128 shards): 76%|ββββββββ | 37268/49172 [00:02<01:05, 182.62 examples/s] Saving the dataset (98/128 shards): 77%|ββββββββ | 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (99/128 shards): 77%|ββββββββ | 38036/49172 [00:02<01:00, 182.62 examples/s] Saving the dataset (100/128 shards): 78%|ββββββββ | 38420/49172 [00:02<00:58, 182.62 examples/s] Saving the dataset (101/128 shards): 79%|ββββββββ | 38804/49172 [00:02<00:56, 182.62 examples/s] Saving the dataset (102/128 shards): 80%|ββββββββ | 39188/49172 [00:02<00:54, 182.62 examples/s] Saving the dataset (103/128 shards): 80%|ββββββββ | 39572/49172 [00:02<00:52, 182.62 examples/s] Saving the dataset (104/128 shards): 81%|βββββββββ | 39956/49172 [00:02<00:50, 182.62 examples/s] Saving the dataset (105/128 shards): 82%|βββββββββ | 40340/49172 [00:02<00:48, 182.62 examples/s] Saving the dataset (106/128 shards): 83%|βββββββββ | 40724/49172 [00:02<00:46, 182.62 examples/s] Saving the dataset (107/128 shards): 84%|βββββββββ | 41108/49172 [00:02<00:44, 182.62 examples/s] Saving the dataset (108/128 shards): 84%|βββββββββ | 41492/49172 [00:02<00:42, 182.62 examples/s] Saving the dataset (109/128 shards): 85%|βββββββββ | 41876/49172 [00:02<00:39, 182.62 examples/s] Saving the dataset (110/128 shards): 86%|βββββββββ | 42260/49172 [00:02<00:37, 182.62 examples/s] Saving the dataset (111/128 shards): 87%|βββββββββ | 42644/49172 [00:02<00:35, 182.62 examples/s] Saving the dataset (112/128 shards): 88%|βββββββββ | 43028/49172 [00:02<00:33, 182.62 examples/s] Saving the dataset (113/128 shards): 88%|βββββββββ | 43412/49172 [00:02<00:31, 182.62 examples/s] Saving the dataset (114/128 shards): 89%|βββββββββ | 43796/49172 [00:02<00:29, 182.62 examples/s] Saving the dataset (115/128 shards): 90%|βββββββββ | 44180/49172 [00:02<00:27, 182.62 examples/s] Saving the dataset (116/128 shards): 91%|ββββββββββ| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (117/128 shards): 91%|ββββββββββ| 44948/49172 [00:02<00:23, 182.62 examples/s] Saving the dataset (118/128 shards): 92%|ββββββββββ| 45332/49172 [00:02<00:21, 182.62 examples/s] Saving the dataset (119/128 shards): 93%|ββββββββββ| 45716/49172 [00:02<00:18, 182.62 examples/s] Saving the dataset (120/128 shards): 94%|ββββββββββ| 46100/49172 [00:02<00:16, 182.62 examples/s] Saving the dataset (121/128 shards): 95%|ββββββββββ| 46484/49172 [00:02<00:14, 182.62 examples/s] Saving the dataset (122/128 shards): 95%|ββββββββββ| 46868/49172 [00:02<00:12, 182.62 examples/s] Saving the dataset (123/128 shards): 96%|ββββββββββ| 47252/49172 [00:02<00:10, 182.62 examples/s] Saving the dataset (124/128 shards): 97%|ββββββββββ| 47636/49172 [00:02<00:08, 182.62 examples/s] Saving the dataset (125/128 shards): 98%|ββββββββββ| 48020/49172 [00:02<00:06, 182.62 examples/s] Saving the dataset (126/128 shards): 98%|ββββββββββ| 48404/49172 [00:02<00:04, 182.62 examples/s] Saving the dataset (127/128 shards): 99%|ββββββββββ| 48788/49172 [00:02<00:02, 182.62 examples/s] Saving the dataset (128/128 shards): 100%|ββββββββββ| 49172/49172 [00:02<00:00, 182.62 examples/s] Saving the dataset (128/128 shards): 100%|ββββββββββ| 49172/49172 [00:02<00:00, 22251.27 examples/s] | |
| [2025-11-06 16:12:34,233] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:837642] total_num_tokens: 9_208_425 | |
| [2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:837642] `total_supervised_tokens: 6_847_432` | |
| [2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:837642] total_num_steps: 769 | |
| [2025-11-06 16:12:34,425] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:837642] Maximum number of steps set at 769 | |
| [2025-11-06 16:12:34,441] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:837642] Loading tokenizer... meta-llama/Llama-3.2-3B | |
| [2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|> | |
| [2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|> | |
| [2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|> | |
| [2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None | |
| [2025-11-06 16:12:35,271] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference. | |
| [2025-11-06 16:12:35,271] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:837642] Loading model | |
| [2025-11-06 16:12:35,502] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:837642] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2025-11-06 16:12:35,503] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:837642] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2025-11-06 16:12:35,531] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:837642] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True} | |
| Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:00<00:00, 86.01it/s] | |
| [2025-11-06 16:15:46,317] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:837642] Converting modules to torch.bfloat16 | |
| [2025-11-06 16:15:59,472] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:837642] Memory usage after model load 0.000GB () | |
| [2025-11-06 16:16:00,600] [WARNING] [accelerate.utils.other.check_os_kernel:512] [PID:837642] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. | |
| [2025-11-06 16:16:07,189] [INFO] [axolotl.train.save_initial_configs:412] [PID:837642] Pre-saving tokenizer to ./outputs/qat_out/... | |
| [2025-11-06 16:16:07,288] [INFO] [axolotl.train.save_initial_configs:417] [PID:837642] Pre-saving model config to ./outputs/qat_out/... | |
| [2025-11-06 16:16:07,290] [INFO] [axolotl.train.execute_training:203] [PID:837642] Starting trainer... | |
| 0%| | 0/769 [00:00<?, ?it/s] 0%| | 1/769 [00:10<2:09:18, 10.10s/it] {'loss': 1.1473, 'grad_norm': 4.625, 'learning_rate': 0.0, 'memory/max_active (GiB)': 34.78, 'memory/max_allocated (GiB)': 34.78, 'memory/device_reserved (GiB)': 41.06, 'tokens_per_second_per_gpu': 879.01, 'epoch': 0.0} | |
| 0%| | 1/769 [00:10<2:09:18, 10.10s/it][2025-11-06 16:16:17,683] [INFO] [axolotl.core.trainers.base._save:671] [PID:837642] Saving model checkpoint to ./outputs/qat_out/checkpoint-1 | |
| 0%| | 2/769 [00:28<3:13:22, 15.13s/it] {'loss': 1.1048, 'grad_norm': 4.34375, 'learning_rate': 2.6315789473684213e-07, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 55.05, 'tokens_per_second_per_gpu': 1653.05, 'epoch': 0.0} | |
| 0%| | 2/769 [00:28<3:13:22, 15.13s/it] 0%| | 3/769 [00:35<2:23:03, 11.21s/it] {'loss': 1.1442, 'grad_norm': 4.6875, 'learning_rate': 5.263157894736843e-07, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1475.08, 'epoch': 0.0} | |
| 0%| | 3/769 [00:35<2:23:03, 11.21s/it] 1%| | 4/769 [00:40<1:55:11, 9.03s/it] {'loss': 1.1473, 'grad_norm': 3.671875, 'learning_rate': 7.894736842105263e-07, 'memory/max_active (GiB)': 46.8, 'memory/max_allocated (GiB)': 46.8, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 2080.31, 'epoch': 0.01} | |
| 1%| | 4/769 [00:41<1:55:11, 9.03s/it] 1%| | 5/769 [00:46<1:39:41, 7.83s/it] {'loss': 1.1704, 'grad_norm': 4.15625, 'learning_rate': 1.0526315789473685e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1673.31, 'epoch': 0.01} | |
| 1%| | 5/769 [00:46<1:39:41, 7.83s/it] 1%| | 6/769 [00:51<1:28:32, 6.96s/it] {'loss': 1.1557, 'grad_norm': 4.09375, 'learning_rate': 1.3157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1894.62, 'epoch': 0.01} | |
| 1%| | 6/769 [00:51<1:28:32, 6.96s/it] 1%| | 7/769 [00:57<1:20:49, 6.36s/it] {'loss': 1.1819, 'grad_norm': 5.21875, 'learning_rate': 1.5789473684210526e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.51, 'tokens_per_second_per_gpu': 1854.54, 'epoch': 0.01} | |
| 1%| | 7/769 [00:57<1:20:49, 6.36s/it] 1%| | 8/769 [01:02<1:18:07, 6.16s/it] {'loss': 1.1807, 'grad_norm': 4.125, 'learning_rate': 1.8421052631578948e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1751.39, 'epoch': 0.01} | |
| 1%| | 8/769 [01:02<1:18:07, 6.16s/it] 1%| | 9/769 [01:08<1:16:07, 6.01s/it] {'loss': 1.1324, 'grad_norm': 4.40625, 'learning_rate': 2.105263157894737e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1412.84, 'epoch': 0.01} | |
| 1%| | 9/769 [01:08<1:16:07, 6.01s/it] 1%|β | 10/769 [01:13<1:12:34, 5.74s/it] {'loss': 1.1556, 'grad_norm': 3.78125, 'learning_rate': 2.368421052631579e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1849.28, 'epoch': 0.01} | |
| 1%|β | 10/769 [01:13<1:12:34, 5.74s/it] 1%|β | 11/769 [01:19<1:12:17, 5.72s/it] {'loss': 1.1855, 'grad_norm': 3.859375, 'learning_rate': 2.631578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1624.94, 'epoch': 0.01} | |
| 1%|β | 11/769 [01:19<1:12:17, 5.72s/it] 2%|β | 12/769 [01:24<1:12:03, 5.71s/it] {'loss': 1.1024, 'grad_norm': 4.09375, 'learning_rate': 2.8947368421052634e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1463.62, 'epoch': 0.02} | |
| 2%|β | 12/769 [01:25<1:12:03, 5.71s/it] 2%|β | 13/769 [01:29<1:08:10, 5.41s/it] {'loss': 1.1591, 'grad_norm': 3.765625, 'learning_rate': 3.157894736842105e-06, 'memory/max_active (GiB)': 41.5, 'memory/max_allocated (GiB)': 41.5, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1910.89, 'epoch': 0.02} | |
| 2%|β | 13/769 [01:29<1:08:10, 5.41s/it] 2%|β | 14/769 [01:35<1:09:08, 5.49s/it] {'loss': 1.0844, 'grad_norm': 3.484375, 'learning_rate': 3.421052631578948e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1588.42, 'epoch': 0.02} | |
| 2%|β | 14/769 [01:35<1:09:08, 5.49s/it] 2%|β | 15/769 [01:39<1:05:28, 5.21s/it] {'loss': 1.2444, 'grad_norm': 3.984375, 'learning_rate': 3.6842105263157896e-06, 'memory/max_active (GiB)': 41.46, 'memory/max_allocated (GiB)': 41.46, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1568.51, 'epoch': 0.02} | |
| 2%|β | 15/769 [01:39<1:05:28, 5.21s/it] 2%|β | 16/769 [01:45<1:07:09, 5.35s/it] {'loss': 1.1609, 'grad_norm': 3.578125, 'learning_rate': 3.947368421052632e-06, 'memory/max_active (GiB)': 46.74, 'memory/max_allocated (GiB)': 46.74, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1338.16, 'epoch': 0.02} | |
| 2%|β | 16/769 [01:45<1:07:09, 5.35s/it] 2%|β | 17/769 [01:51<1:08:21, 5.45s/it] {'loss': 1.099, 'grad_norm': 3.125, 'learning_rate': 4.210526315789474e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1704.75, 'epoch': 0.02} | |
| 2%|β | 17/769 [01:51<1:08:21, 5.45s/it] 2%|β | 18/769 [01:57<1:09:09, 5.53s/it] {'loss': 1.0828, 'grad_norm': 3.046875, 'learning_rate': 4.473684210526316e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1664.81, 'epoch': 0.02} | |
| 2%|β | 18/769 [01:57<1:09:09, 5.53s/it] 2%|β | 19/769 [02:02<1:07:39, 5.41s/it] {'loss': 1.2319, 'grad_norm': 3.4375, 'learning_rate': 4.736842105263158e-06, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1602.68, 'epoch': 0.02} | |
| 2%|β | 19/769 [02:02<1:07:39, 5.41s/it] 3%|β | 20/769 [02:07<1:08:44, 5.51s/it] {'loss': 1.0638, 'grad_norm': 2.953125, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.13, 'tokens_per_second_per_gpu': 1755.41, 'epoch': 0.03} | |
| 3%|β | 20/769 [02:07<1:08:44, 5.51s/it] 3%|β | 21/769 [02:13<1:09:22, 5.56s/it] {'loss': 1.2072, 'grad_norm': 2.859375, 'learning_rate': 5.263157894736842e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1833.74, 'epoch': 0.03} | |
| 3%|β | 21/769 [02:13<1:09:22, 5.56s/it] 3%|β | 22/769 [02:18<1:07:37, 5.43s/it] {'loss': 1.2006, 'grad_norm': 3.109375, 'learning_rate': 5.526315789473685e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1567.66, 'epoch': 0.03} | |
| 3%|β | 22/769 [02:18<1:07:37, 5.43s/it] 3%|β | 23/769 [02:25<1:12:34, 5.84s/it] {'loss': 1.0081, 'grad_norm': 2.625, 'learning_rate': 5.789473684210527e-06, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1412.25, 'epoch': 0.03} | |
| 3%|β | 23/769 [02:25<1:12:34, 5.84s/it] 3%|β | 24/769 [02:31<1:12:04, 5.80s/it] {'loss': 1.1437, 'grad_norm': 2.6875, 'learning_rate': 6.0526315789473685e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.26, 'tokens_per_second_per_gpu': 1749.27, 'epoch': 0.03} | |
| 3%|β | 24/769 [02:31<1:12:04, 5.80s/it] 3%|β | 25/769 [02:36<1:11:33, 5.77s/it] {'loss': 1.0987, 'grad_norm': 2.765625, 'learning_rate': 6.31578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1627.64, 'epoch': 0.03} | |
| 3%|β | 25/769 [02:36<1:11:33, 5.77s/it] 3%|β | 26/769 [02:42<1:09:06, 5.58s/it] {'loss': 1.1046, 'grad_norm': 2.6875, 'learning_rate': 6.578947368421054e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1967.95, 'epoch': 0.03} | |
| 3%|β | 26/769 [02:42<1:09:06, 5.58s/it] 4%|β | 27/769 [02:47<1:09:27, 5.62s/it] {'loss': 1.1072, 'grad_norm': 2.8125, 'learning_rate': 6.842105263157896e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1594.62, 'epoch': 0.04} | |
| 4%|β | 27/769 [02:47<1:09:27, 5.62s/it] 4%|β | 28/769 [02:52<1:05:29, 5.30s/it] {'loss': 1.1974, 'grad_norm': 3.03125, 'learning_rate': 7.1052631578947375e-06, 'memory/max_active (GiB)': 41.47, 'memory/max_allocated (GiB)': 41.47, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1766.49, 'epoch': 0.04} | |
| 4%|β | 28/769 [02:52<1:05:29, 5.30s/it] 4%|β | 29/769 [02:58<1:06:52, 5.42s/it] {'loss': 1.2164, 'grad_norm': 2.734375, 'learning_rate': 7.368421052631579e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1593.41, 'epoch': 0.04} | |
| 4%|β | 29/769 [02:58<1:06:52, 5.42s/it] 4%|β | 30/769 [03:03<1:07:46, 5.50s/it] {'loss': 1.1324, 'grad_norm': 2.65625, 'learning_rate': 7.631578947368423e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1607.66, 'epoch': 0.04} | |
| 4%|β | 30/769 [03:03<1:07:46, 5.50s/it] 4%|β | 31/769 [03:09<1:08:21, 5.56s/it] {'loss': 1.0693, 'grad_norm': 2.96875, 'learning_rate': 7.894736842105265e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1454.28, 'epoch': 0.04} | |
| 4%|β | 31/769 [03:09<1:08:21, 5.56s/it] 4%|β | 32/769 [03:14<1:06:43, 5.43s/it] {'loss': 1.0988, 'grad_norm': 2.65625, 'learning_rate': 8.157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1972.66, 'epoch': 0.04} | |
| 4%|β | 32/769 [03:14<1:06:43, 5.43s/it] 4%|β | 33/769 [03:19<1:03:27, 5.17s/it] {'loss': 1.074, 'grad_norm': 2.84375, 'learning_rate': 8.421052631578948e-06, 'memory/max_active (GiB)': 41.48, 'memory/max_allocated (GiB)': 41.48, 'memory/device_reserved (GiB)': 77.13, 'tokens_per_second_per_gpu': 1705.8, 'epoch': 0.04} | |
| 4%|β | 33/769 [03:19<1:03:27, 5.17s/it] 4%|β | 34/769 [03:24<1:03:14, 5.16s/it] {'loss': 1.1673, 'grad_norm': 2.671875, 'learning_rate': 8.68421052631579e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 76.88, 'tokens_per_second_per_gpu': 1761.02, 'epoch': 0.04} | |
| 4%|β | 34/769 [03:24<1:03:14, 5.16s/it] 5%|β | 35/769 [03:29<1:02:59, 5.15s/it] {'loss': 1.1655, 'grad_norm': 3.03125, 'learning_rate': 8.947368421052632e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 76.01, 'tokens_per_second_per_gpu': 1479.42, 'epoch': 0.05} | |
| 5%|β | 35/769 [03:29<1:02:59, 5.15s/it] 5%|β | 36/769 [03:35<1:04:53, 5.31s/it] {'loss': 1.1483, 'grad_norm': 2.71875, 'learning_rate': 9.210526315789474e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1466.59, 'epoch': 0.05} | |
| 5%|β | 36/769 [03:35<1:04:53, 5.31s/it] 5%|β | 37/769 [03:40<1:04:09, 5.26s/it] {'loss': 1.159, 'grad_norm': 2.578125, 'learning_rate': 9.473684210526315e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1806.68, 'epoch': 0.05} | |
| 5%|β | 37/769 [03:40<1:04:09, 5.26s/it] 5%|β | 38/769 [03:45<1:05:41, 5.39s/it] {'loss': 1.1035, 'grad_norm': 2.625, 'learning_rate': 9.736842105263159e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1689.12, 'epoch': 0.05} | |
| 5%|β | 38/769 [03:45<1:05:41, 5.39s/it] 5%|β | 39/769 [03:51<1:06:41, 5.48s/it] {'loss': 1.0471, 'grad_norm': 2.609375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1579.8, 'epoch': 0.05} | |
| 5%|β | 39/769 [03:51<1:06:41, 5.48s/it] 5%|β | 40/769 [03:56<1:05:20, 5.38s/it] {'loss': 1.2091, 'grad_norm': 2.96875, 'learning_rate': 1.0263157894736844e-05, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1542.1, 'epoch': 0.05} | |
| 5%|β | 40/769 [03:56<1:05:20, 5.38s/it] 5%|β | 41/769 [04:02<1:06:25, 5.47s/it] {'loss': 1.0722, 'grad_norm': 2.59375, 'learning_rate': 1.0526315789473684e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1585.53, 'epoch': 0.05} | |
| 5%|β | 41/769 [04:02<1:06:25, 5.47s/it] |