[2025-11-06 16:11:41,516] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:837642] baseline 0.000GB ()
[2025-11-06 16:11:41,516] [INFO] [axolotl.cli.config.load_cfg:248] [PID:837642] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "3b-qat-nvfp4.yaml",
  "base_model": "meta-llama/Llama-3.2-3B",
  "base_model_config": "meta-llama/Llama-3.2-3B",
  "batch_size": 64,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "cosine_constant_lr_ratio": 0.0,
  "cosine_min_lr_ratio": 1.0,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_prepared_path": "./outputs/dataset_prepared",
  "dataset_processes": 128,
  "datasets": [
    {
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "yahma/alpaca-cleaned",
      "split": "train[:95%]",
      "trust_remote_code": false,
      "type": "alpaca"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 64,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "hub_model_id": "AlexHung29629/3b-qat-nvfp4",
  "include_tkps": true,
  "is_llama_derived_model": true,
  "learning_rate": 2e-05,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 64,
  "model_config_type": "llama",
  "num_epochs": 1.0,
  "optimizer": "adamw_torch_fused",
  "output_dir": "./outputs/qat_out/",
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qat": {
    "activation_dtype": "TorchAOQuantDType.nvfp4",
    "group_size": 16,
    "quantize_embedding": false,
    "weight_dtype": "TorchAOQuantDType.nvfp4"
  },
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_first_step": true,
  "save_only_model": true,
  "save_safetensors": true,
  "saves_per_epoch": 1,
  "sequence_len": 8192,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "pad_token": "<|finetune_right_pad_id|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "meta-llama/Llama-3.2-3B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 1
}
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|>
[2025-11-06 16:11:46,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
[2025-11-06 16:11:46,489] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:837642] Unable to find prepared dataset in outputs/dataset_prepared/9bc662aed65b76546b2d635b3957a343
[2025-11-06 16:11:46,492] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:837642] Loading raw datasets...
[2025-11-06 16:11:46,492] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:837642] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]Generating train split: 100%|██████████| 51760/51760 [00:00<00:00, 229845.07 examples/s]Generating train split: 100%|██████████| 51760/51760 [00:00<00:00, 228708.96 examples/s]
[2025-11-06 16:12:08,982] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:837642] Loading dataset: yahma/alpaca-cleaned with base_type: alpaca and prompt_style: None
Tokenizing Prompts (num_proc=128):   0%|          | 0/49172 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=128):   0%|          | 84/49172 [00:01<13:38, 59.94 examples/s]Tokenizing Prompts (num_proc=128):   0%|          | 176/49172 [00:01<06:01, 135.38 examples/s]Tokenizing Prompts (num_proc=128):   2%|▏         | 792/49172 [00:01<01:05, 743.93 examples/s]Tokenizing Prompts (num_proc=128):   3%|▎         | 1289/49172 [00:01<00:39, 1224.84 examples/s]Tokenizing Prompts (num_proc=128):   3%|▎         | 1663/49172 [00:01<00:30, 1550.01 examples/s]Tokenizing Prompts (num_proc=128):   4%|▍         | 2053/49172 [00:02<00:25, 1838.37 examples/s]Tokenizing Prompts (num_proc=128):   5%|▌         | 2462/49172 [00:02<00:22, 2120.82 examples/s]Tokenizing Prompts (num_proc=128):   6%|▌         | 2931/49172 [00:02<00:17, 2593.78 examples/s]Tokenizing Prompts (num_proc=128):   7%|▋         | 3316/49172 [00:02<00:17, 2658.78 examples/s]Tokenizing Prompts (num_proc=128):   8%|▊         | 3704/49172 [00:02<00:16, 2738.71 examples/s]Tokenizing Prompts (num_proc=128):   8%|▊         | 4072/49172 [00:02<00:16, 2765.32 examples/s]Tokenizing Prompts (num_proc=128):   9%|▉         | 4392/49172 [00:02<00:16, 2672.66 examples/s]Tokenizing Prompts (num_proc=128):  10%|▉         | 4715/49172 [00:03<00:17, 2538.79 examples/s]Tokenizing Prompts (num_proc=128):  10%|█         | 5094/49172 [00:03<00:17, 2586.15 examples/s]Tokenizing Prompts (num_proc=128):  11%|█         | 5504/49172 [00:03<00:16, 2692.82 examples/s]Tokenizing Prompts (num_proc=128):  12%|█▏        | 5910/49172 [00:03<00:15, 2763.20 examples/s]Tokenizing Prompts (num_proc=128):  13%|█▎        | 6311/49172 [00:03<00:14, 2975.07 examples/s]Tokenizing Prompts (num_proc=128):  14%|█▎        | 6668/49172 [00:03<00:14, 2859.49 examples/s]Tokenizing Prompts (num_proc=128):  14%|█▍        | 7058/49172 [00:03<00:14, 2844.49 examples/s]Tokenizing Prompts (num_proc=128):  15%|█▌        | 7385/49172 [00:03<00:15, 2728.62 examples/s]Tokenizing Prompts (num_proc=128):  16%|█▌        | 7836/49172 [00:04<00:13, 3015.74 examples/s]Tokenizing Prompts (num_proc=128):  17%|█▋        | 8193/49172 [00:04<00:14, 2873.13 examples/s]Tokenizing Prompts (num_proc=128):  17%|█▋        | 8537/49172 [00:04<00:14, 2768.24 examples/s]Tokenizing Prompts (num_proc=128):  18%|█▊        | 8959/49172 [00:04<00:13, 2885.47 examples/s]Tokenizing Prompts (num_proc=128):  19%|█▉        | 9339/49172 [00:04<00:14, 2805.07 examples/s]Tokenizing Prompts (num_proc=128):  20%|█▉        | 9755/49172 [00:04<00:13, 2846.53 examples/s]Tokenizing Prompts (num_proc=128):  21%|██        | 10145/49172 [00:04<00:13, 2839.00 examples/s]Tokenizing Prompts (num_proc=128):  22%|██▏       | 10628/49172 [00:05<00:12, 3040.97 examples/s]Tokenizing Prompts (num_proc=128):  22%|██▏       | 11039/49172 [00:05<00:11, 3253.40 examples/s]Tokenizing Prompts (num_proc=128):  23%|██▎       | 11385/49172 [00:05<00:12, 2982.44 examples/s]Tokenizing Prompts (num_proc=128):  24%|██▍       | 11692/49172 [00:05<00:13, 2756.63 examples/s]Tokenizing Prompts (num_proc=128):  25%|██▍       | 12063/49172 [00:05<00:13, 2720.60 examples/s]Tokenizing Prompts (num_proc=128):  25%|██▌       | 12463/49172 [00:05<00:13, 2810.95 examples/s]Tokenizing Prompts (num_proc=128):  26%|██▌       | 12850/49172 [00:05<00:12, 2819.15 examples/s]Tokenizing Prompts (num_proc=128):  27%|██▋       | 13292/49172 [00:06<00:12, 2922.32 examples/s]Tokenizing Prompts (num_proc=128):  28%|██▊       | 13600/49172 [00:06<00:13, 2700.65 examples/s]Tokenizing Prompts (num_proc=128):  28%|██▊       | 13989/49172 [00:06<00:12, 2716.86 examples/s]Tokenizing Prompts (num_proc=128):  29%|██▉       | 14475/49172 [00:06<00:12, 2852.98 examples/s]Tokenizing Prompts (num_proc=128):  30%|███       | 14914/49172 [00:06<00:11, 2938.70 examples/s]Tokenizing Prompts (num_proc=128):  31%|███▏      | 15412/49172 [00:06<00:10, 3223.11 examples/s]Tokenizing Prompts (num_proc=128):  32%|███▏      | 15748/49172 [00:06<00:11, 3025.74 examples/s]Tokenizing Prompts (num_proc=128):  33%|███▎      | 16102/49172 [00:06<00:11, 2858.09 examples/s]Tokenizing Prompts (num_proc=128):  33%|███▎      | 16438/49172 [00:07<00:12, 2720.40 examples/s]Tokenizing Prompts (num_proc=128):  34%|███▍      | 16767/49172 [00:07<00:12, 2595.58 examples/s]Tokenizing Prompts (num_proc=128):  35%|███▍      | 17149/49172 [00:07<00:12, 2665.96 examples/s]Tokenizing Prompts (num_proc=128):  36%|███▌      | 17516/49172 [00:07<00:11, 2643.92 examples/s]Tokenizing Prompts (num_proc=128):  37%|███▋      | 17961/49172 [00:07<00:10, 2849.24 examples/s]Tokenizing Prompts (num_proc=128):  37%|███▋      | 18279/49172 [00:07<00:10, 2822.57 examples/s]Tokenizing Prompts (num_proc=128):  38%|███▊      | 18574/49172 [00:07<00:11, 2591.21 examples/s]Tokenizing Prompts (num_proc=128):  39%|███▊      | 18957/49172 [00:08<00:11, 2625.56 examples/s]Tokenizing Prompts (num_proc=128):  40%|███▉      | 19438/49172 [00:08<00:10, 2894.61 examples/s]Tokenizing Prompts (num_proc=128):  40%|████      | 19776/49172 [00:08<00:10, 2741.27 examples/s]Tokenizing Prompts (num_proc=128):  41%|████      | 20133/49172 [00:08<00:10, 2676.71 examples/s]Tokenizing Prompts (num_proc=128):  42%|████▏     | 20582/49172 [00:08<00:10, 2813.67 examples/s]Tokenizing Prompts (num_proc=128):  43%|████▎     | 21056/49172 [00:08<00:09, 2991.88 examples/s]Tokenizing Prompts (num_proc=128):  44%|████▎     | 21507/49172 [00:08<00:08, 3120.60 examples/s]Tokenizing Prompts (num_proc=128):  44%|████▍     | 21844/49172 [00:09<00:09, 2927.37 examples/s]Tokenizing Prompts (num_proc=128):  45%|████▌     | 22214/49172 [00:09<00:09, 2980.95 examples/s]Tokenizing Prompts (num_proc=128):  46%|████▌     | 22524/49172 [00:09<00:09, 2740.32 examples/s]Tokenizing Prompts (num_proc=128):  46%|████▋     | 22828/49172 [00:09<00:09, 2639.81 examples/s]Tokenizing Prompts (num_proc=128):  47%|████▋     | 23146/49172 [00:09<00:10, 2510.35 examples/s]Tokenizing Prompts (num_proc=128):  48%|████▊     | 23619/49172 [00:09<00:09, 2763.04 examples/s]Tokenizing Prompts (num_proc=128):  49%|████▉     | 24069/49172 [00:09<00:08, 2964.79 examples/s]Tokenizing Prompts (num_proc=128):  50%|████▉     | 24455/49172 [00:09<00:08, 2924.79 examples/s]Tokenizing Prompts (num_proc=128):  50%|█████     | 24809/49172 [00:10<00:08, 2856.24 examples/s]Tokenizing Prompts (num_proc=128):  51%|█████     | 25112/49172 [00:10<00:09, 2621.31 examples/s]Tokenizing Prompts (num_proc=128):  52%|█████▏    | 25522/49172 [00:10<00:08, 2714.48 examples/s]Tokenizing Prompts (num_proc=128):  53%|█████▎    | 25873/49172 [00:10<00:08, 2705.01 examples/s]Tokenizing Prompts (num_proc=128):  54%|█████▎    | 26322/49172 [00:10<00:07, 2907.63 examples/s]Tokenizing Prompts (num_proc=128):  54%|█████▍    | 26671/49172 [00:10<00:08, 2787.83 examples/s]Tokenizing Prompts (num_proc=128):  55%|█████▌    | 27127/49172 [00:10<00:07, 2923.69 examples/s]Tokenizing Prompts (num_proc=128):  56%|█████▌    | 27506/49172 [00:11<00:07, 2929.98 examples/s]Tokenizing Prompts (num_proc=128):  57%|█████▋    | 27928/49172 [00:11<00:07, 2972.71 examples/s]Tokenizing Prompts (num_proc=128):  58%|█████▊    | 28285/49172 [00:11<00:06, 3017.41 examples/s]Tokenizing Prompts (num_proc=128):  58%|█████▊    | 28589/49172 [00:11<00:07, 2769.13 examples/s]Tokenizing Prompts (num_proc=128):  59%|█████▉    | 28960/49172 [00:11<00:07, 2746.25 examples/s]Tokenizing Prompts (num_proc=128):  60%|█████▉    | 29343/49172 [00:11<00:07, 2735.06 examples/s]Tokenizing Prompts (num_proc=128):  60%|██████    | 29730/49172 [00:11<00:07, 2777.41 examples/s]Tokenizing Prompts (num_proc=128):  61%|██████▏   | 30201/49172 [00:11<00:06, 3027.86 examples/s]Tokenizing Prompts (num_proc=128):  62%|██████▏   | 30511/49172 [00:12<00:06, 2794.02 examples/s]Tokenizing Prompts (num_proc=128):  63%|██████▎   | 30963/49172 [00:12<00:06, 2860.65 examples/s]Tokenizing Prompts (num_proc=128):  64%|██████▎   | 31260/49172 [00:12<00:06, 2659.00 examples/s]Tokenizing Prompts (num_proc=128):  64%|██████▍   | 31661/49172 [00:12<00:06, 2744.06 examples/s]Tokenizing Prompts (num_proc=128):  65%|██████▌   | 32103/49172 [00:12<00:05, 2886.04 examples/s]Tokenizing Prompts (num_proc=128):  66%|██████▌   | 32523/49172 [00:12<00:05, 2982.93 examples/s]Tokenizing Prompts (num_proc=128):  67%|██████▋   | 32894/49172 [00:12<00:05, 2895.08 examples/s]Tokenizing Prompts (num_proc=128):  68%|██████▊   | 33277/49172 [00:13<00:05, 3037.84 examples/s]Tokenizing Prompts (num_proc=128):  68%|██████▊   | 33649/49172 [00:13<00:05, 2842.83 examples/s]Tokenizing Prompts (num_proc=128):  69%|██████▉   | 33971/49172 [00:13<00:05, 2690.27 examples/s]Tokenizing Prompts (num_proc=128):  70%|██████▉   | 34337/49172 [00:13<00:05, 2695.46 examples/s]Tokenizing Prompts (num_proc=128):  71%|███████   | 34730/49172 [00:13<00:05, 2719.08 examples/s]Tokenizing Prompts (num_proc=128):  72%|███████▏  | 35193/49172 [00:13<00:04, 2910.87 examples/s]Tokenizing Prompts (num_proc=128):  72%|███████▏  | 35583/49172 [00:13<00:04, 2876.12 examples/s]Tokenizing Prompts (num_proc=128):  73%|███████▎  | 35965/49172 [00:14<00:04, 2847.46 examples/s]Tokenizing Prompts (num_proc=128):  74%|███████▍  | 36341/49172 [00:14<00:04, 2829.40 examples/s]Tokenizing Prompts (num_proc=128):  75%|███████▍  | 36647/49172 [00:14<00:04, 2673.80 examples/s]Tokenizing Prompts (num_proc=128):  75%|███████▌  | 37103/49172 [00:14<00:04, 2839.03 examples/s]Tokenizing Prompts (num_proc=128):  76%|███████▋  | 37568/49172 [00:14<00:03, 3009.77 examples/s]Tokenizing Prompts (num_proc=128):  77%|███████▋  | 37894/49172 [00:14<00:03, 2833.19 examples/s]Tokenizing Prompts (num_proc=128):  78%|███████▊  | 38273/49172 [00:14<00:03, 2797.05 examples/s]Tokenizing Prompts (num_proc=128):  79%|███████▊  | 38625/49172 [00:15<00:03, 2713.00 examples/s]Tokenizing Prompts (num_proc=128):  80%|███████▉  | 39134/49172 [00:15<00:03, 2989.24 examples/s]Tokenizing Prompts (num_proc=128):  80%|████████  | 39484/49172 [00:15<00:03, 2970.93 examples/s]Tokenizing Prompts (num_proc=128):  81%|████████  | 39830/49172 [00:15<00:03, 2822.53 examples/s]Tokenizing Prompts (num_proc=128):  82%|████████▏ | 40145/49172 [00:15<00:03, 2691.61 examples/s]Tokenizing Prompts (num_proc=128):  83%|████████▎ | 40611/49172 [00:15<00:02, 2919.81 examples/s]Tokenizing Prompts (num_proc=128):  83%|████████▎ | 40933/49172 [00:15<00:02, 2872.22 examples/s]Tokenizing Prompts (num_proc=128):  84%|████████▍ | 41235/49172 [00:15<00:03, 2550.35 examples/s]Tokenizing Prompts (num_proc=128):  85%|████████▍ | 41636/49172 [00:16<00:02, 2623.37 examples/s]Tokenizing Prompts (num_proc=128):  85%|████████▌ | 42011/49172 [00:16<00:02, 2667.31 examples/s]Tokenizing Prompts (num_proc=128):  87%|████████▋ | 42565/49172 [00:16<00:02, 3151.00 examples/s]Tokenizing Prompts (num_proc=128):  87%|████████▋ | 42957/49172 [00:16<00:02, 3026.85 examples/s]Tokenizing Prompts (num_proc=128):  88%|████████▊ | 43325/49172 [00:16<00:02, 2828.92 examples/s]Tokenizing Prompts (num_proc=128):  89%|████████▉ | 43700/49172 [00:16<00:01, 2829.86 examples/s]Tokenizing Prompts (num_proc=128):  90%|████████▉ | 44022/49172 [00:16<00:01, 2649.38 examples/s]Tokenizing Prompts (num_proc=128):  90%|█████████ | 44422/49172 [00:17<00:01, 2713.96 examples/s]Tokenizing Prompts (num_proc=128):  91%|█████████ | 44782/49172 [00:17<00:01, 2703.83 examples/s]Tokenizing Prompts (num_proc=128):  92%|█████████▏| 45155/49172 [00:17<00:01, 2707.69 examples/s]Tokenizing Prompts (num_proc=128):  93%|█████████▎| 45628/49172 [00:17<00:01, 2900.02 examples/s]Tokenizing Prompts (num_proc=128):  94%|█████████▎| 45986/49172 [00:17<00:01, 2826.26 examples/s]Tokenizing Prompts (num_proc=128):  94%|█████████▍| 46333/49172 [00:17<00:01, 2738.12 examples/s]Tokenizing Prompts (num_proc=128):  95%|█████████▌| 46787/49172 [00:17<00:00, 3126.02 examples/s]Tokenizing Prompts (num_proc=128):  96%|█████████▌| 47266/49172 [00:17<00:00, 3503.55 examples/s]Tokenizing Prompts (num_proc=128):  97%|█████████▋| 47643/49172 [00:18<00:00, 3338.62 examples/s]Tokenizing Prompts (num_proc=128):  98%|█████████▊| 48031/49172 [00:18<00:00, 3013.54 examples/s]Tokenizing Prompts (num_proc=128):  99%|█████████▊| 48485/49172 [00:18<00:00, 3381.58 examples/s]Tokenizing Prompts (num_proc=128):  99%|█████████▉| 48913/49172 [00:18<00:00, 3410.78 examples/s]Tokenizing Prompts (num_proc=128): 100%|██████████| 49172/49172 [00:19<00:00, 2564.69 examples/s]
[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:837642] min_input_len: 33
[2025-11-06 16:12:28,393] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:837642] max_input_len: 1051
Dropping Long Sequences (>8192) (num_proc=128):   0%|          | 0/49172 [00:00<?, ? examples/s]Dropping Long Sequences (>8192) (num_proc=128):   1%|          | 385/49172 [00:00<01:43, 471.65 examples/s]Dropping Long Sequences (>8192) (num_proc=128):   5%|▌         | 2695/49172 [00:00<00:12, 3750.47 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  13%|█▎        | 6545/49172 [00:01<00:04, 9405.16 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  19%|█▉        | 9236/49172 [00:01<00:03, 12478.76 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  25%|██▌       | 12308/49172 [00:01<00:02, 15931.93 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  30%|███       | 14996/49172 [00:01<00:01, 17930.73 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  37%|███▋      | 18068/49172 [00:01<00:01, 20677.88 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  44%|████▍     | 21524/49172 [00:01<00:01, 23483.32 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  50%|█████     | 24596/49172 [00:01<00:01, 23335.07 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  60%|██████    | 29588/49172 [00:01<00:00, 29166.32 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  69%|██████▉   | 33812/49172 [00:01<00:00, 31889.27 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  76%|███████▌  | 37268/49172 [00:02<00:00, 32125.51 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  83%|████████▎ | 40724/49172 [00:02<00:00, 31182.70 examples/s]Dropping Long Sequences (>8192) (num_proc=128):  93%|█████████▎| 45716/49172 [00:02<00:00, 33653.89 examples/s]Dropping Long Sequences (>8192) (num_proc=128): 100%|██████████| 49172/49172 [00:02<00:00, 18442.91 examples/s]
Saving the dataset (0/128 shards):   0%|          | 0/49172 [00:00<?, ? examples/s]Saving the dataset (0/128 shards):   1%|          | 385/49172 [00:02<04:27, 182.62 examples/s]Saving the dataset (1/128 shards):   1%|          | 385/49172 [00:02<04:27, 182.62 examples/s]Saving the dataset (2/128 shards):   2%|▏         | 770/49172 [00:02<04:25, 182.62 examples/s]Saving the dataset (3/128 shards):   2%|▏         | 1155/49172 [00:02<04:22, 182.62 examples/s]Saving the dataset (4/128 shards):   3%|▎         | 1540/49172 [00:02<04:20, 182.62 examples/s]Saving the dataset (5/128 shards):   4%|▍         | 1925/49172 [00:02<04:18, 182.62 examples/s]Saving the dataset (6/128 shards):   5%|▍         | 2310/49172 [00:02<04:16, 182.62 examples/s]Saving the dataset (7/128 shards):   5%|▌         | 2695/49172 [00:02<04:14, 182.62 examples/s]Saving the dataset (8/128 shards):   6%|▋         | 3080/49172 [00:02<04:12, 182.62 examples/s]Saving the dataset (9/128 shards):   7%|▋         | 3465/49172 [00:02<04:10, 182.62 examples/s]Saving the dataset (10/128 shards):   8%|▊         | 3850/49172 [00:02<04:08, 182.62 examples/s]Saving the dataset (11/128 shards):   9%|▊         | 4235/49172 [00:02<04:06, 182.62 examples/s]Saving the dataset (12/128 shards):   9%|▉         | 4620/49172 [00:02<04:03, 182.62 examples/s]Saving the dataset (13/128 shards):  10%|█         | 5005/49172 [00:02<04:01, 182.62 examples/s]Saving the dataset (14/128 shards):  11%|█         | 5390/49172 [00:02<03:59, 182.62 examples/s]Saving the dataset (15/128 shards):  12%|█▏        | 5775/49172 [00:02<03:57, 182.62 examples/s]Saving the dataset (16/128 shards):  13%|█▎        | 6160/49172 [00:02<03:55, 182.62 examples/s]Saving the dataset (17/128 shards):  13%|█▎        | 6545/49172 [00:02<03:53, 182.62 examples/s]Saving the dataset (18/128 shards):  14%|█▍        | 6930/49172 [00:02<03:51, 182.62 examples/s]Saving the dataset (19/128 shards):  15%|█▍        | 7315/49172 [00:02<03:49, 182.62 examples/s]Saving the dataset (20/128 shards):  16%|█▌        | 7700/49172 [00:02<03:47, 182.62 examples/s]Saving the dataset (21/128 shards):  16%|█▋        | 8084/49172 [00:02<03:44, 182.62 examples/s]Saving the dataset (22/128 shards):  17%|█▋        | 8468/49172 [00:02<03:42, 182.62 examples/s]Saving the dataset (23/128 shards):  18%|█▊        | 8852/49172 [00:02<03:40, 182.62 examples/s]Saving the dataset (24/128 shards):  19%|█▉        | 9236/49172 [00:02<03:38, 182.62 examples/s]Saving the dataset (25/128 shards):  20%|█▉        | 9620/49172 [00:02<03:36, 182.62 examples/s]Saving the dataset (26/128 shards):  20%|██        | 10004/49172 [00:02<03:34, 182.62 examples/s]Saving the dataset (27/128 shards):  21%|██        | 10388/49172 [00:02<03:32, 182.62 examples/s]Saving the dataset (28/128 shards):  22%|██▏       | 10772/49172 [00:02<03:30, 182.62 examples/s]Saving the dataset (29/128 shards):  23%|██▎       | 11156/49172 [00:02<03:28, 182.62 examples/s]Saving the dataset (30/128 shards):  23%|██▎       | 11540/49172 [00:02<03:26, 182.62 examples/s]Saving the dataset (31/128 shards):  24%|██▍       | 11924/49172 [00:02<03:23, 182.62 examples/s]Saving the dataset (32/128 shards):  25%|██▌       | 12308/49172 [00:02<03:21, 182.62 examples/s]Saving the dataset (33/128 shards):  26%|██▌       | 12692/49172 [00:02<03:19, 182.62 examples/s]Saving the dataset (34/128 shards):  27%|██▋       | 13076/49172 [00:02<03:17, 182.62 examples/s]Saving the dataset (35/128 shards):  27%|██▋       | 13460/49172 [00:02<03:15, 182.62 examples/s]Saving the dataset (36/128 shards):  28%|██▊       | 13844/49172 [00:02<03:13, 182.62 examples/s]Saving the dataset (37/128 shards):  29%|██▉       | 14228/49172 [00:02<03:11, 182.62 examples/s]Saving the dataset (38/128 shards):  30%|██▉       | 14612/49172 [00:02<03:09, 182.62 examples/s]Saving the dataset (39/128 shards):  30%|███       | 14996/49172 [00:02<03:07, 182.62 examples/s]Saving the dataset (40/128 shards):  31%|███▏      | 15380/49172 [00:02<03:05, 182.62 examples/s]Saving the dataset (41/128 shards):  32%|███▏      | 15764/49172 [00:02<03:02, 182.62 examples/s]Saving the dataset (42/128 shards):  33%|███▎      | 16148/49172 [00:02<03:00, 182.62 examples/s]Saving the dataset (43/128 shards):  34%|███▎      | 16532/49172 [00:02<02:58, 182.62 examples/s]Saving the dataset (44/128 shards):  34%|███▍      | 16916/49172 [00:02<02:56, 182.62 examples/s]Saving the dataset (45/128 shards):  35%|███▌      | 17300/49172 [00:02<02:54, 182.62 examples/s]Saving the dataset (46/128 shards):  36%|███▌      | 17684/49172 [00:02<02:52, 182.62 examples/s]Saving the dataset (47/128 shards):  37%|███▋      | 18068/49172 [00:02<02:50, 182.62 examples/s]Saving the dataset (48/128 shards):  38%|███▊      | 18452/49172 [00:02<02:48, 182.62 examples/s]Saving the dataset (49/128 shards):  38%|███▊      | 18836/49172 [00:02<02:46, 182.62 examples/s]Saving the dataset (50/128 shards):  39%|███▉      | 19220/49172 [00:02<02:44, 182.62 examples/s]Saving the dataset (51/128 shards):  40%|███▉      | 19604/49172 [00:02<02:41, 182.62 examples/s]Saving the dataset (52/128 shards):  41%|████      | 19988/49172 [00:02<02:39, 182.62 examples/s]Saving the dataset (53/128 shards):  41%|████▏     | 20372/49172 [00:02<02:37, 182.62 examples/s]Saving the dataset (54/128 shards):  42%|████▏     | 20756/49172 [00:02<02:35, 182.62 examples/s]Saving the dataset (55/128 shards):  43%|████▎     | 21140/49172 [00:02<02:33, 182.62 examples/s]Saving the dataset (56/128 shards):  44%|████▍     | 21524/49172 [00:02<02:31, 182.62 examples/s]Saving the dataset (57/128 shards):  45%|████▍     | 21908/49172 [00:02<02:29, 182.62 examples/s]Saving the dataset (58/128 shards):  45%|████▌     | 22292/49172 [00:02<02:27, 182.62 examples/s]Saving the dataset (59/128 shards):  46%|████▌     | 22676/49172 [00:02<02:25, 182.62 examples/s]Saving the dataset (60/128 shards):  47%|████▋     | 23060/49172 [00:02<02:22, 182.62 examples/s]Saving the dataset (61/128 shards):  48%|████▊     | 23444/49172 [00:02<02:20, 182.62 examples/s]Saving the dataset (62/128 shards):  48%|████▊     | 23828/49172 [00:02<02:18, 182.62 examples/s]Saving the dataset (63/128 shards):  49%|████▉     | 24212/49172 [00:02<02:16, 182.62 examples/s]Saving the dataset (64/128 shards):  50%|█████     | 24596/49172 [00:02<02:14, 182.62 examples/s]Saving the dataset (65/128 shards):  51%|█████     | 24980/49172 [00:02<02:12, 182.62 examples/s]Saving the dataset (66/128 shards):  52%|█████▏    | 25748/49172 [00:02<02:08, 182.62 examples/s]Saving the dataset (67/128 shards):  52%|█████▏    | 25748/49172 [00:02<02:08, 182.62 examples/s]Saving the dataset (68/128 shards):  53%|█████▎    | 26132/49172 [00:02<02:06, 182.62 examples/s]Saving the dataset (69/128 shards):  54%|█████▍    | 26516/49172 [00:02<02:04, 182.62 examples/s]Saving the dataset (70/128 shards):  55%|█████▍    | 26900/49172 [00:02<02:01, 182.62 examples/s]Saving the dataset (71/128 shards):  55%|█████▌    | 27284/49172 [00:02<01:59, 182.62 examples/s]Saving the dataset (72/128 shards):  56%|█████▋    | 27668/49172 [00:02<01:57, 182.62 examples/s]Saving the dataset (73/128 shards):  57%|█████▋    | 28052/49172 [00:02<01:55, 182.62 examples/s]Saving the dataset (74/128 shards):  58%|█████▊    | 28436/49172 [00:02<01:53, 182.62 examples/s]Saving the dataset (75/128 shards):  59%|█████▊    | 28820/49172 [00:02<01:51, 182.62 examples/s]Saving the dataset (76/128 shards):  59%|█████▉    | 29204/49172 [00:02<01:49, 182.62 examples/s]Saving the dataset (77/128 shards):  60%|██████    | 29588/49172 [00:02<01:47, 182.62 examples/s]Saving the dataset (78/128 shards):  61%|██████    | 29972/49172 [00:02<01:45, 182.62 examples/s]Saving the dataset (79/128 shards):  62%|██████▏   | 30356/49172 [00:02<01:43, 182.62 examples/s]Saving the dataset (80/128 shards):  63%|██████▎   | 30740/49172 [00:02<01:40, 182.62 examples/s]Saving the dataset (81/128 shards):  63%|██████▎   | 31124/49172 [00:02<01:38, 182.62 examples/s]Saving the dataset (82/128 shards):  64%|██████▍   | 31508/49172 [00:02<01:36, 182.62 examples/s]Saving the dataset (83/128 shards):  65%|██████▍   | 31892/49172 [00:02<01:34, 182.62 examples/s]Saving the dataset (84/128 shards):  66%|██████▌   | 32276/49172 [00:02<01:32, 182.62 examples/s]Saving the dataset (85/128 shards):  66%|██████▋   | 32660/49172 [00:02<01:30, 182.62 examples/s]Saving the dataset (86/128 shards):  67%|██████▋   | 33044/49172 [00:02<01:28, 182.62 examples/s]Saving the dataset (87/128 shards):  68%|██████▊   | 33428/49172 [00:02<01:26, 182.62 examples/s]Saving the dataset (88/128 shards):  69%|██████▉   | 33812/49172 [00:02<01:24, 182.62 examples/s]Saving the dataset (89/128 shards):  70%|███████   | 34580/49172 [00:02<01:19, 182.62 examples/s]Saving the dataset (90/128 shards):  70%|███████   | 34580/49172 [00:02<01:19, 182.62 examples/s]Saving the dataset (91/128 shards):  71%|███████   | 34964/49172 [00:02<01:17, 182.62 examples/s]Saving the dataset (92/128 shards):  72%|███████▏  | 35348/49172 [00:02<01:15, 182.62 examples/s]Saving the dataset (93/128 shards):  73%|███████▎  | 35732/49172 [00:02<01:13, 182.62 examples/s]Saving the dataset (94/128 shards):  73%|███████▎  | 36116/49172 [00:02<01:11, 182.62 examples/s]Saving the dataset (95/128 shards):  74%|███████▍  | 36500/49172 [00:02<01:09, 182.62 examples/s]Saving the dataset (96/128 shards):  75%|███████▌  | 36884/49172 [00:02<01:07, 182.62 examples/s]Saving the dataset (97/128 shards):  76%|███████▌  | 37268/49172 [00:02<01:05, 182.62 examples/s]Saving the dataset (98/128 shards):  77%|███████▋  | 38036/49172 [00:02<01:00, 182.62 examples/s]Saving the dataset (99/128 shards):  77%|███████▋  | 38036/49172 [00:02<01:00, 182.62 examples/s]Saving the dataset (100/128 shards):  78%|███████▊  | 38420/49172 [00:02<00:58, 182.62 examples/s]Saving the dataset (101/128 shards):  79%|███████▉  | 38804/49172 [00:02<00:56, 182.62 examples/s]Saving the dataset (102/128 shards):  80%|███████▉  | 39188/49172 [00:02<00:54, 182.62 examples/s]Saving the dataset (103/128 shards):  80%|████████  | 39572/49172 [00:02<00:52, 182.62 examples/s]Saving the dataset (104/128 shards):  81%|████████▏ | 39956/49172 [00:02<00:50, 182.62 examples/s]Saving the dataset (105/128 shards):  82%|████████▏ | 40340/49172 [00:02<00:48, 182.62 examples/s]Saving the dataset (106/128 shards):  83%|████████▎ | 40724/49172 [00:02<00:46, 182.62 examples/s]Saving the dataset (107/128 shards):  84%|████████▎ | 41108/49172 [00:02<00:44, 182.62 examples/s]Saving the dataset (108/128 shards):  84%|████████▍ | 41492/49172 [00:02<00:42, 182.62 examples/s]Saving the dataset (109/128 shards):  85%|████████▌ | 41876/49172 [00:02<00:39, 182.62 examples/s]Saving the dataset (110/128 shards):  86%|████████▌ | 42260/49172 [00:02<00:37, 182.62 examples/s]Saving the dataset (111/128 shards):  87%|████████▋ | 42644/49172 [00:02<00:35, 182.62 examples/s]Saving the dataset (112/128 shards):  88%|████████▊ | 43028/49172 [00:02<00:33, 182.62 examples/s]Saving the dataset (113/128 shards):  88%|████████▊ | 43412/49172 [00:02<00:31, 182.62 examples/s]Saving the dataset (114/128 shards):  89%|████████▉ | 43796/49172 [00:02<00:29, 182.62 examples/s]Saving the dataset (115/128 shards):  90%|████████▉ | 44180/49172 [00:02<00:27, 182.62 examples/s]Saving the dataset (116/128 shards):  91%|█████████▏| 44948/49172 [00:02<00:23, 182.62 examples/s]Saving the dataset (117/128 shards):  91%|█████████▏| 44948/49172 [00:02<00:23, 182.62 examples/s]Saving the dataset (118/128 shards):  92%|█████████▏| 45332/49172 [00:02<00:21, 182.62 examples/s]Saving the dataset (119/128 shards):  93%|█████████▎| 45716/49172 [00:02<00:18, 182.62 examples/s]Saving the dataset (120/128 shards):  94%|█████████▍| 46100/49172 [00:02<00:16, 182.62 examples/s]Saving the dataset (121/128 shards):  95%|█████████▍| 46484/49172 [00:02<00:14, 182.62 examples/s]Saving the dataset (122/128 shards):  95%|█████████▌| 46868/49172 [00:02<00:12, 182.62 examples/s]Saving the dataset (123/128 shards):  96%|█████████▌| 47252/49172 [00:02<00:10, 182.62 examples/s]Saving the dataset (124/128 shards):  97%|█████████▋| 47636/49172 [00:02<00:08, 182.62 examples/s]Saving the dataset (125/128 shards):  98%|█████████▊| 48020/49172 [00:02<00:06, 182.62 examples/s]Saving the dataset (126/128 shards):  98%|█████████▊| 48404/49172 [00:02<00:04, 182.62 examples/s]Saving the dataset (127/128 shards):  99%|█████████▉| 48788/49172 [00:02<00:02, 182.62 examples/s]Saving the dataset (128/128 shards): 100%|██████████| 49172/49172 [00:02<00:00, 182.62 examples/s]Saving the dataset (128/128 shards): 100%|██████████| 49172/49172 [00:02<00:00, 22251.27 examples/s]
[2025-11-06 16:12:34,233] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:837642] total_num_tokens: 9_208_425
[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:837642] `total_supervised_tokens: 6_847_432`
[2025-11-06 16:12:34,425] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:837642] total_num_steps: 769
[2025-11-06 16:12:34,425] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:837642] Maximum number of steps set at 769
[2025-11-06 16:12:34,441] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:837642] Loading tokenizer... meta-llama/Llama-3.2-3B
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:837642] EOS: 128001 / <|end_of_text|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:837642] BOS: 128000 / <|begin_of_text|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:837642] PAD: 128004 / <|finetune_right_pad_id|>
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:837642] UNK: None / None
[2025-11-06 16:12:35,271] [INFO] [axolotl.loaders.tokenizer.load_tokenizer:295] [PID:837642] No Chat template selected. Consider adding a chat template for easier inference.
[2025-11-06 16:12:35,271] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:837642] Loading model
[2025-11-06 16:12:35,502] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:837642] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-11-06 16:12:35,503] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:837642] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-11-06 16:12:35,531] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:837642] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 86.01it/s]
[2025-11-06 16:15:46,317] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:837642] Converting modules to torch.bfloat16
[2025-11-06 16:15:59,472] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:837642] Memory usage after model load 0.000GB ()
[2025-11-06 16:16:00,600] [WARNING] [accelerate.utils.other.check_os_kernel:512] [PID:837642] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-11-06 16:16:07,189] [INFO] [axolotl.train.save_initial_configs:412] [PID:837642] Pre-saving tokenizer to ./outputs/qat_out/...
[2025-11-06 16:16:07,288] [INFO] [axolotl.train.save_initial_configs:417] [PID:837642] Pre-saving model config to ./outputs/qat_out/...
[2025-11-06 16:16:07,290] [INFO] [axolotl.train.execute_training:203] [PID:837642] Starting trainer...
  0%|          | 0/769 [00:00<?, ?it/s]  0%|          | 1/769 [00:10<2:09:18, 10.10s/it]                                                 {'loss': 1.1473, 'grad_norm': 4.625, 'learning_rate': 0.0, 'memory/max_active (GiB)': 34.78, 'memory/max_allocated (GiB)': 34.78, 'memory/device_reserved (GiB)': 41.06, 'tokens_per_second_per_gpu': 879.01, 'epoch': 0.0}
  0%|          | 1/769 [00:10<2:09:18, 10.10s/it][2025-11-06 16:16:17,683] [INFO] [axolotl.core.trainers.base._save:671] [PID:837642] Saving model checkpoint to ./outputs/qat_out/checkpoint-1
  0%|          | 2/769 [00:28<3:13:22, 15.13s/it]                                                 {'loss': 1.1048, 'grad_norm': 4.34375, 'learning_rate': 2.6315789473684213e-07, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 55.05, 'tokens_per_second_per_gpu': 1653.05, 'epoch': 0.0}
  0%|          | 2/769 [00:28<3:13:22, 15.13s/it]  0%|          | 3/769 [00:35<2:23:03, 11.21s/it]                                                 {'loss': 1.1442, 'grad_norm': 4.6875, 'learning_rate': 5.263157894736843e-07, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1475.08, 'epoch': 0.0}
  0%|          | 3/769 [00:35<2:23:03, 11.21s/it]  1%|          | 4/769 [00:40<1:55:11,  9.03s/it]                                                 {'loss': 1.1473, 'grad_norm': 3.671875, 'learning_rate': 7.894736842105263e-07, 'memory/max_active (GiB)': 46.8, 'memory/max_allocated (GiB)': 46.8, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 2080.31, 'epoch': 0.01}
  1%|          | 4/769 [00:41<1:55:11,  9.03s/it]  1%|          | 5/769 [00:46<1:39:41,  7.83s/it]                                                 {'loss': 1.1704, 'grad_norm': 4.15625, 'learning_rate': 1.0526315789473685e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 76.38, 'tokens_per_second_per_gpu': 1673.31, 'epoch': 0.01}
  1%|          | 5/769 [00:46<1:39:41,  7.83s/it]  1%|          | 6/769 [00:51<1:28:32,  6.96s/it]                                                 {'loss': 1.1557, 'grad_norm': 4.09375, 'learning_rate': 1.3157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1894.62, 'epoch': 0.01}
  1%|          | 6/769 [00:51<1:28:32,  6.96s/it]  1%|          | 7/769 [00:57<1:20:49,  6.36s/it]                                                 {'loss': 1.1819, 'grad_norm': 5.21875, 'learning_rate': 1.5789473684210526e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.51, 'tokens_per_second_per_gpu': 1854.54, 'epoch': 0.01}
  1%|          | 7/769 [00:57<1:20:49,  6.36s/it]  1%|          | 8/769 [01:02<1:18:07,  6.16s/it]                                                 {'loss': 1.1807, 'grad_norm': 4.125, 'learning_rate': 1.8421052631578948e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1751.39, 'epoch': 0.01}
  1%|          | 8/769 [01:02<1:18:07,  6.16s/it]  1%|          | 9/769 [01:08<1:16:07,  6.01s/it]                                                 {'loss': 1.1324, 'grad_norm': 4.40625, 'learning_rate': 2.105263157894737e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1412.84, 'epoch': 0.01}
  1%|          | 9/769 [01:08<1:16:07,  6.01s/it]  1%|▏         | 10/769 [01:13<1:12:34,  5.74s/it]                                                  {'loss': 1.1556, 'grad_norm': 3.78125, 'learning_rate': 2.368421052631579e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1849.28, 'epoch': 0.01}
  1%|▏         | 10/769 [01:13<1:12:34,  5.74s/it]  1%|▏         | 11/769 [01:19<1:12:17,  5.72s/it]                                                  {'loss': 1.1855, 'grad_norm': 3.859375, 'learning_rate': 2.631578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1624.94, 'epoch': 0.01}
  1%|▏         | 11/769 [01:19<1:12:17,  5.72s/it]  2%|▏         | 12/769 [01:24<1:12:03,  5.71s/it]                                                  {'loss': 1.1024, 'grad_norm': 4.09375, 'learning_rate': 2.8947368421052634e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1463.62, 'epoch': 0.02}
  2%|▏         | 12/769 [01:25<1:12:03,  5.71s/it]  2%|▏         | 13/769 [01:29<1:08:10,  5.41s/it]                                                  {'loss': 1.1591, 'grad_norm': 3.765625, 'learning_rate': 3.157894736842105e-06, 'memory/max_active (GiB)': 41.5, 'memory/max_allocated (GiB)': 41.5, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1910.89, 'epoch': 0.02}
  2%|▏         | 13/769 [01:29<1:08:10,  5.41s/it]  2%|▏         | 14/769 [01:35<1:09:08,  5.49s/it]                                                  {'loss': 1.0844, 'grad_norm': 3.484375, 'learning_rate': 3.421052631578948e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1588.42, 'epoch': 0.02}
  2%|▏         | 14/769 [01:35<1:09:08,  5.49s/it]  2%|▏         | 15/769 [01:39<1:05:28,  5.21s/it]                                                  {'loss': 1.2444, 'grad_norm': 3.984375, 'learning_rate': 3.6842105263157896e-06, 'memory/max_active (GiB)': 41.46, 'memory/max_allocated (GiB)': 41.46, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1568.51, 'epoch': 0.02}
  2%|▏         | 15/769 [01:39<1:05:28,  5.21s/it]  2%|▏         | 16/769 [01:45<1:07:09,  5.35s/it]                                                  {'loss': 1.1609, 'grad_norm': 3.578125, 'learning_rate': 3.947368421052632e-06, 'memory/max_active (GiB)': 46.74, 'memory/max_allocated (GiB)': 46.74, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1338.16, 'epoch': 0.02}
  2%|▏         | 16/769 [01:45<1:07:09,  5.35s/it]  2%|▏         | 17/769 [01:51<1:08:21,  5.45s/it]                                                  {'loss': 1.099, 'grad_norm': 3.125, 'learning_rate': 4.210526315789474e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1704.75, 'epoch': 0.02}
  2%|▏         | 17/769 [01:51<1:08:21,  5.45s/it]  2%|▏         | 18/769 [01:57<1:09:09,  5.53s/it]                                                  {'loss': 1.0828, 'grad_norm': 3.046875, 'learning_rate': 4.473684210526316e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.88, 'tokens_per_second_per_gpu': 1664.81, 'epoch': 0.02}
  2%|▏         | 18/769 [01:57<1:09:09,  5.53s/it]  2%|▏         | 19/769 [02:02<1:07:39,  5.41s/it]                                                  {'loss': 1.2319, 'grad_norm': 3.4375, 'learning_rate': 4.736842105263158e-06, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1602.68, 'epoch': 0.02}
  2%|▏         | 19/769 [02:02<1:07:39,  5.41s/it]  3%|▎         | 20/769 [02:07<1:08:44,  5.51s/it]                                                  {'loss': 1.0638, 'grad_norm': 2.953125, 'learning_rate': 5e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.13, 'tokens_per_second_per_gpu': 1755.41, 'epoch': 0.03}
  3%|▎         | 20/769 [02:07<1:08:44,  5.51s/it]  3%|▎         | 21/769 [02:13<1:09:22,  5.56s/it]                                                  {'loss': 1.2072, 'grad_norm': 2.859375, 'learning_rate': 5.263157894736842e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1833.74, 'epoch': 0.03}
  3%|▎         | 21/769 [02:13<1:09:22,  5.56s/it]  3%|▎         | 22/769 [02:18<1:07:37,  5.43s/it]                                                  {'loss': 1.2006, 'grad_norm': 3.109375, 'learning_rate': 5.526315789473685e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1567.66, 'epoch': 0.03}
  3%|▎         | 22/769 [02:18<1:07:37,  5.43s/it]  3%|▎         | 23/769 [02:25<1:12:34,  5.84s/it]                                                  {'loss': 1.0081, 'grad_norm': 2.625, 'learning_rate': 5.789473684210527e-06, 'memory/max_active (GiB)': 49.73, 'memory/max_allocated (GiB)': 49.73, 'memory/device_reserved (GiB)': 78.38, 'tokens_per_second_per_gpu': 1412.25, 'epoch': 0.03}
  3%|▎         | 23/769 [02:25<1:12:34,  5.84s/it]  3%|▎         | 24/769 [02:31<1:12:04,  5.80s/it]                                                  {'loss': 1.1437, 'grad_norm': 2.6875, 'learning_rate': 6.0526315789473685e-06, 'memory/max_active (GiB)': 46.78, 'memory/max_allocated (GiB)': 46.78, 'memory/device_reserved (GiB)': 78.26, 'tokens_per_second_per_gpu': 1749.27, 'epoch': 0.03}
  3%|▎         | 24/769 [02:31<1:12:04,  5.80s/it]  3%|▎         | 25/769 [02:36<1:11:33,  5.77s/it]                                                  {'loss': 1.0987, 'grad_norm': 2.765625, 'learning_rate': 6.31578947368421e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1627.64, 'epoch': 0.03}
  3%|▎         | 25/769 [02:36<1:11:33,  5.77s/it]  3%|▎         | 26/769 [02:42<1:09:06,  5.58s/it]                                                  {'loss': 1.1046, 'grad_norm': 2.6875, 'learning_rate': 6.578947368421054e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1967.95, 'epoch': 0.03}
  3%|▎         | 26/769 [02:42<1:09:06,  5.58s/it]  4%|▎         | 27/769 [02:47<1:09:27,  5.62s/it]                                                  {'loss': 1.1072, 'grad_norm': 2.8125, 'learning_rate': 6.842105263157896e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1594.62, 'epoch': 0.04}
  4%|▎         | 27/769 [02:47<1:09:27,  5.62s/it]  4%|▎         | 28/769 [02:52<1:05:29,  5.30s/it]                                                  {'loss': 1.1974, 'grad_norm': 3.03125, 'learning_rate': 7.1052631578947375e-06, 'memory/max_active (GiB)': 41.47, 'memory/max_allocated (GiB)': 41.47, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1766.49, 'epoch': 0.04}
  4%|▎         | 28/769 [02:52<1:05:29,  5.30s/it]  4%|▍         | 29/769 [02:58<1:06:52,  5.42s/it]                                                  {'loss': 1.2164, 'grad_norm': 2.734375, 'learning_rate': 7.368421052631579e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1593.41, 'epoch': 0.04}
  4%|▍         | 29/769 [02:58<1:06:52,  5.42s/it]  4%|▍         | 30/769 [03:03<1:07:46,  5.50s/it]                                                  {'loss': 1.1324, 'grad_norm': 2.65625, 'learning_rate': 7.631578947368423e-06, 'memory/max_active (GiB)': 46.76, 'memory/max_allocated (GiB)': 46.76, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1607.66, 'epoch': 0.04}
  4%|▍         | 30/769 [03:03<1:07:46,  5.50s/it]  4%|▍         | 31/769 [03:09<1:08:21,  5.56s/it]                                                  {'loss': 1.0693, 'grad_norm': 2.96875, 'learning_rate': 7.894736842105265e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1454.28, 'epoch': 0.04}
  4%|▍         | 31/769 [03:09<1:08:21,  5.56s/it]  4%|▍         | 32/769 [03:14<1:06:43,  5.43s/it]                                                  {'loss': 1.0988, 'grad_norm': 2.65625, 'learning_rate': 8.157894736842106e-06, 'memory/max_active (GiB)': 43.82, 'memory/max_allocated (GiB)': 43.82, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1972.66, 'epoch': 0.04}
  4%|▍         | 32/769 [03:14<1:06:43,  5.43s/it]  4%|▍         | 33/769 [03:19<1:03:27,  5.17s/it]                                                  {'loss': 1.074, 'grad_norm': 2.84375, 'learning_rate': 8.421052631578948e-06, 'memory/max_active (GiB)': 41.48, 'memory/max_allocated (GiB)': 41.48, 'memory/device_reserved (GiB)': 77.13, 'tokens_per_second_per_gpu': 1705.8, 'epoch': 0.04}
  4%|▍         | 33/769 [03:19<1:03:27,  5.17s/it]  4%|▍         | 34/769 [03:24<1:03:14,  5.16s/it]                                                  {'loss': 1.1673, 'grad_norm': 2.671875, 'learning_rate': 8.68421052631579e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 76.88, 'tokens_per_second_per_gpu': 1761.02, 'epoch': 0.04}
  4%|▍         | 34/769 [03:24<1:03:14,  5.16s/it]  5%|▍         | 35/769 [03:29<1:02:59,  5.15s/it]                                                  {'loss': 1.1655, 'grad_norm': 3.03125, 'learning_rate': 8.947368421052632e-06, 'memory/max_active (GiB)': 43.79, 'memory/max_allocated (GiB)': 43.79, 'memory/device_reserved (GiB)': 76.01, 'tokens_per_second_per_gpu': 1479.42, 'epoch': 0.05}
  5%|▍         | 35/769 [03:29<1:02:59,  5.15s/it]  5%|▍         | 36/769 [03:35<1:04:53,  5.31s/it]                                                  {'loss': 1.1483, 'grad_norm': 2.71875, 'learning_rate': 9.210526315789474e-06, 'memory/max_active (GiB)': 46.75, 'memory/max_allocated (GiB)': 46.75, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1466.59, 'epoch': 0.05}
  5%|▍         | 36/769 [03:35<1:04:53,  5.31s/it]  5%|▍         | 37/769 [03:40<1:04:09,  5.26s/it]                                                  {'loss': 1.159, 'grad_norm': 2.578125, 'learning_rate': 9.473684210526315e-06, 'memory/max_active (GiB)': 43.81, 'memory/max_allocated (GiB)': 43.81, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1806.68, 'epoch': 0.05}
  5%|▍         | 37/769 [03:40<1:04:09,  5.26s/it]  5%|▍         | 38/769 [03:45<1:05:41,  5.39s/it]                                                  {'loss': 1.1035, 'grad_norm': 2.625, 'learning_rate': 9.736842105263159e-06, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1689.12, 'epoch': 0.05}
  5%|▍         | 38/769 [03:45<1:05:41,  5.39s/it]  5%|▌         | 39/769 [03:51<1:06:41,  5.48s/it]                                                  {'loss': 1.0471, 'grad_norm': 2.609375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1579.8, 'epoch': 0.05}
  5%|▌         | 39/769 [03:51<1:06:41,  5.48s/it]  5%|▌         | 40/769 [03:56<1:05:20,  5.38s/it]                                                  {'loss': 1.2091, 'grad_norm': 2.96875, 'learning_rate': 1.0263157894736844e-05, 'memory/max_active (GiB)': 43.8, 'memory/max_allocated (GiB)': 43.8, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1542.1, 'epoch': 0.05}
  5%|▌         | 40/769 [03:56<1:05:20,  5.38s/it]  5%|▌         | 41/769 [04:02<1:06:25,  5.47s/it]                                                  {'loss': 1.0722, 'grad_norm': 2.59375, 'learning_rate': 1.0526315789473684e-05, 'memory/max_active (GiB)': 46.77, 'memory/max_allocated (GiB)': 46.77, 'memory/device_reserved (GiB)': 77.38, 'tokens_per_second_per_gpu': 1585.53, 'epoch': 0.05}
  5%|▌         | 41/769 [04:02<1:06:25,  5.47s/it]