[2026-01-04 00:10:03,594] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:15692] bf16 support detected, enabling for this configuration.
[2026-01-04 00:10:03,667] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:15692] baseline 0.000GB ()
[2026-01-04 00:10:03,668] [INFO] [axolotl.cli.config.load_cfg:259] [PID:15692] config:
{
  "activation_offloading": true,
  "adapter": "lora",
  "axolotl_config_path": "train.yml",
  "base_model": "shb777/Llama-3.3-8B-Instruct-128K",
  "base_model_config": "shb777/Llama-3.3-8B-Instruct-128K",
  "batch_size": 4,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_86",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "chat_template": "llama3",
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 96,
  "dataset_prepared_path": "dataset_prepareds",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "WokeAI/polititune-tankie-warmup",
      "split": "train",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": false,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 2,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "include_tkps": true,
  "is_llama_derived_model": true,
  "learning_rate": 1e-05,
  "liger_fused_linear_cross_entropy": false,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 16,
  "lora_dropout": 0.35,
  "lora_r": 64,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "rex",
  "max_grad_norm": 0.1,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "llama",
  "num_epochs": 2.0,
  "optimizer": "adamw_torch_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./output",
  "pad_to_sequence_len": true,
  "peft_use_rslora": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin",
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "remove_unused_columns": false,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "pad_token": "<|reserved_special_token_2|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "shb777/Llama-3.3-8B-Instruct-128K",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_project": "newyear",
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-01-04 00:10:03,672] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:15692] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
[2026-01-04 00:10:04,197] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <|eot_id|>
[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <|begin_of_text|>
[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <|reserved_special_token_2|>
[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None
[2026-01-04 00:10:04,199] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:15692] Loading prepared dataset from disk at dataset_prepareds/a420619428aa6c5576289a496238883a...
[2026-01-04 00:10:04,213] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:15692] total_num_tokens: 684_427
[2026-01-04 00:10:04,225] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:15692] `total_supervised_tokens: 498_319`
[2026-01-04 00:10:04,242] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
[2026-01-04 00:10:05,245] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
[2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.21780610084533691
[2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
[2026-01-04 00:10:05,685] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.22262859344482422
[2026-01-04 00:10:05,686] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
[2026-01-04 00:10:05,930] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.2447659969329834
[2026-01-04 00:10:05,931] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
[2026-01-04 00:10:06,188] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.25751280784606934
[2026-01-04 00:10:06,211] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84]
[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:494] [PID:15692] data_loader_len: 42
[2026-01-04 00:10:06,212] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:510] [PID:15692] sample_packing_eff_est across ranks: [0.9946216401599702]
[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:15692] sample_packing_eff_est: 1.0
[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:15692] total_num_steps: 84
[2026-01-04 00:10:06,212] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:15692] Maximum number of steps set at 84
[2026-01-04 00:10:06,240] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:15692] loading tokenizer... shb777/Llama-3.3-8B-Instruct-128K
[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <|eot_id|>
[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <|begin_of_text|>
[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <|reserved_special_token_2|>
[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None
[2026-01-04 00:10:06,719] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:15692] Loading model
[2026-01-04 00:10:06,766] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:15692] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-04 00:10:06,767] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:15692] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-04 00:10:06,768] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:344] [PID:15692] Applying multipack dataloader patch for sample packing...
[2026-01-04 00:10:06,873] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:15692] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': False, 'rms_norm': True, 'swiglu': True}
[2026-01-04 00:10:07,074] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:15692] Applying Cut Cross Entropy to model type: llama
Loading checkpoint shards:   0%|                                                                                                                                                          | 0/4 [00:00<?, ?it/s]Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 133.41it/s]
[2026-01-04 00:10:09,611] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:15692] Converting modules to torch.bfloat16
[2026-01-04 00:10:10,928] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] Memory usage after model load 0.000GB ()
[2026-01-04 00:10:10,929] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:15692] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465
[2026-01-04 00:10:12,458] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] after adapters 0.000GB ()
[2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:413] [PID:15692] Pre-saving adapter config to ./output...
[2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:417] [PID:15692] Pre-saving tokenizer to ./output...
[2026-01-04 00:10:20,929] [INFO] [axolotl.train.save_initial_configs:422] [PID:15692] Pre-saving model config to ./output...
[2026-01-04 00:10:20,931] [INFO] [axolotl.train.execute_training:212] [PID:15692] Starting trainer...
[2026-01-04 00:10:22,337] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.49930906295776367
[2026-01-04 00:10:22,811] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.47399091720581055
[2026-01-04 00:10:23,297] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.4854564666748047
[2026-01-04 00:10:23,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.5046358108520508
[2026-01-04 00:10:23,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84]
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.23.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20260104_001243-myor4kbd[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdistinctive-firebrand-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fizzzz/newyear[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/fizzzz/newyear/runs/myor4kbd[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-01-04 00:12:45,380] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:15692] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                                                                                    | 0/84 [00:00<?, ?it/s]  1%|██                                                                                                                                                                          | 1/84 [00:36<50:25, 36.45s/it]                                                                                                                                                                                                                {'loss': 3.0634, 'grad_norm': 8.193868637084961, 'learning_rate': 4.999999873689376e-06, 'ppl': 21.40019, 'memory/max_active (GiB)': 20.05, 'memory/max_allocated (GiB)': 20.05, 'memory/device_reserved (GiB)': 20.62, 'tokens/train_per_sec_per_gpu': 155.38671875, 'epoch': 0.02, 'tokens/total': 16384.0, 'tokens/trainable': 9972.0}
  1%|██                                                                                                                                                                          | 1/84 [00:36<50:25, 36.45s/it]  2%|████                                                                                                                                                                        | 2/84 [00:53<33:55, 24.83s/it]                                                                                                                                                                                                                {'loss': 3.2197, 'grad_norm': 8.957365036010742, 'learning_rate': 9.999999747378752e-06, 'ppl': 25.02061, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.190185546875, 'epoch': 0.05, 'tokens/total': 32768.0, 'tokens/trainable': 21140.0}
  2%|████                                                                                                                                                                        | 2/84 [00:53<33:55, 24.83s/it]  4%|██████▏                                                                                                                                                                     | 3/84 [01:08<27:24, 20.31s/it]                                                                                                                                                                                                                {'loss': 3.0878, 'grad_norm': 7.201009273529053, 'learning_rate': 9.987669727706816e-06, 'ppl': 21.92878, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 446.0367736816406, 'epoch': 0.07, 'tokens/total': 49152.0, 'tokens/trainable': 33455.0}
  4%|██████▏                                                                                                                                                                     | 3/84 [01:08<27:24, 20.31s/it]  5%|████████▏                                                                                                                                                                   | 4/84 [01:22<24:13, 18.17s/it]                                                                                                                                                                                                                {'loss': 3.0947, 'grad_norm': 6.2549729347229, 'learning_rate': 9.97506231215084e-06, 'ppl': 22.08061, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 355.3751220703125, 'epoch': 0.1, 'tokens/total': 65536.0, 'tokens/trainable': 44394.0}
  5%|████████▏                                                                                                                                                                   | 4/84 [01:22<24:13, 18.17s/it]  6%|██████████▏                                                                                                                                                                 | 5/84 [01:39<23:21, 17.74s/it]                                                                                                                                                                                                                {'loss': 2.6876, 'grad_norm': 5.082422733306885, 'learning_rate': 9.962168405763805e-06, 'ppl': 14.69636, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 351.228515625, 'epoch': 0.12, 'tokens/total': 81920.0, 'tokens/trainable': 56961.0}
  6%|██████████▏                                                                                                                                                                 | 5/84 [01:39<23:21, 17.74s/it]  7%|████████████▎                                                                                                                                                               | 6/84 [01:54<21:49, 16.78s/it]                                                                                                                                                                                                                {'loss': 3.0661, 'grad_norm': 4.808957099914551, 'learning_rate': 9.948978913598694e-06, 'ppl': 21.45805, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 418.3861083984375, 'epoch': 0.14, 'tokens/total': 98304.0, 'tokens/trainable': 69788.0}
  7%|████████████▎                                                                                                                                                               | 6/84 [01:54<21:49, 16.78s/it]  8%|██████████████▎                                                                                                                                                             | 7/84 [02:09<20:46, 16.19s/it]                                                                                                                                                                                                                {'loss': 2.7881, 'grad_norm': 3.901789426803589, 'learning_rate': 9.935483831213787e-06, 'ppl': 16.25012, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.1433410644531, 'epoch': 0.17, 'tokens/total': 114688.0, 'tokens/trainable': 81854.0}
  8%|██████████████▎                                                                                                                                                             | 7/84 [02:09<20:46, 16.19s/it] 10%|████████████████▍                                                                                                                                                           | 8/84 [02:24<20:00, 15.80s/it]                                                                                                                                                                                                                {'loss': 2.556, 'grad_norm': 3.0737860202789307, 'learning_rate': 9.921671335177962e-06, 'ppl': 12.88418, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 393.31671142578125, 'epoch': 0.19, 'tokens/total': 131072.0, 'tokens/trainable': 94072.0}
 10%|████████████████▍                                                                                                                                                           | 8/84 [02:24<20:00, 15.80s/it] 11%|██████████████████▍                                                                                                                                                         | 9/84 [02:39<19:25, 15.54s/it]                                                                                                                                                                                                                {'loss': 2.8264, 'grad_norm': 2.6989502906799316, 'learning_rate': 9.907529602060094e-06, 'ppl': 16.88457, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.95263671875, 'epoch': 0.21, 'tokens/total': 147456.0, 'tokens/trainable': 106403.0}
 11%|██████████████████▍                                                                                                                                                         | 9/84 [02:39<19:25, 15.54s/it] 12%|████████████████████▎                                                                                                                                                      | 10/84 [02:54<18:59, 15.39s/it]                                                                                                                                                                                                                {'loss': 2.7276, 'grad_norm': 1.9797707796096802, 'learning_rate': 9.893047717923764e-06, 'ppl': 15.29613, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.0211181640625, 'epoch': 0.24, 'tokens/total': 163840.0, 'tokens/trainable': 119564.0}
 12%|████████████████████▎                                                                                                                                                      | 10/84 [02:54<18:59, 15.39s/it] 13%|██████████████████████▍                                                                                                                                                    | 11/84 [03:09<18:36, 15.29s/it]                                                                                                                                                                                                                {'loss': 2.7648, 'grad_norm': 1.7535356283187866, 'learning_rate': 9.878213859337848e-06, 'ppl': 15.87586, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.7185974121094, 'epoch': 0.26, 'tokens/total': 180224.0, 'tokens/trainable': 130051.0}
 13%|██████████████████████▍                                                                                                                                                    | 11/84 [03:09<18:36, 15.29s/it] 14%|████████████████████████▍                                                                                                                                                  | 12/84 [03:24<18:16, 15.23s/it]                                                                                                                                                                                                                {'loss': 2.7597, 'grad_norm': 1.589630126953125, 'learning_rate': 9.863013474387117e-06, 'ppl': 15.7951, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 436.2579345703125, 'epoch': 0.29, 'tokens/total': 196608.0, 'tokens/trainable': 141402.0}
 14%|████████████████████████▍                                                                                                                                                  | 12/84 [03:24<18:16, 15.23s/it] 15%|██████████████████████████▍                                                                                                                                                | 13/84 [03:39<17:56, 15.16s/it]                                                                                                                                                                                                                {'loss': 2.371, 'grad_norm': 1.2782939672470093, 'learning_rate': 9.847433830145746e-06, 'ppl': 10.7081, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 380.26220703125, 'epoch': 0.31, 'tokens/total': 212992.0, 'tokens/trainable': 152557.0}
 15%|██████████████████████████▍                                                                                                                                                | 13/84 [03:39<17:56, 15.16s/it] 17%|████████████████████████████▌                                                                                                                                              | 14/84 [03:55<17:39, 15.13s/it]                                                                                                                                                                                                                {'loss': 2.3513, 'grad_norm': 1.4143847227096558, 'learning_rate': 9.831460374698509e-06, 'ppl': 10.49921, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 394.7943115234375, 'epoch': 0.33, 'tokens/total': 229376.0, 'tokens/trainable': 165183.0}
 17%|████████████████████████████▌                                                                                                                                              | 14/84 [03:55<17:39, 15.13s/it] 18%|██████████████████████████████▌                                                                                                                                            | 15/84 [04:10<17:23, 15.13s/it]                                                                                                                                                                                                                {'loss': 2.7007, 'grad_norm': 1.272851586341858, 'learning_rate': 9.815078556130175e-06, 'ppl': 14.89015, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.2015380859375, 'epoch': 0.36, 'tokens/total': 245760.0, 'tokens/trainable': 176902.0}
 18%|██████████████████████████████▌                                                                                                                                            | 15/84 [04:10<17:23, 15.13s/it] 19%|████████████████████████████████▌                                                                                                                                          | 16/84 [04:25<17:07, 15.11s/it]                                                                                                                                                                                                                {'loss': 2.6622, 'grad_norm': 1.1848429441452026, 'learning_rate': 9.79827109404141e-06, 'ppl': 14.32778, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.4239501953125, 'epoch': 0.38, 'tokens/total': 262144.0, 'tokens/trainable': 189732.0}
 19%|████████████████████████████████▌                                                                                                                                          | 16/84 [04:25<17:07, 15.11s/it] 20%|██████████████████████████████████▌                                                                                                                                        | 17/84 [04:40<16:50, 15.08s/it]                                                                                                                                                                                                                {'loss': 2.7354, 'grad_norm': 1.4691566228866577, 'learning_rate': 9.781021617527585e-06, 'ppl': 15.41591, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 288.30322265625, 'epoch': 0.4, 'tokens/total': 278528.0, 'tokens/trainable': 199657.0}
 20%|██████████████████████████████████▌                                                                                                                                        | 17/84 [04:40<16:50, 15.08s/it] 21%|████████████████████████████████████▋                                                                                                                                      | 18/84 [04:55<16:33, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.5652, 'grad_norm': 1.199852466583252, 'learning_rate': 9.763312846189365e-06, 'ppl': 13.00326, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 437.531982421875, 'epoch': 0.43, 'tokens/total': 294912.0, 'tokens/trainable': 211618.0}
 21%|████████████████████████████████████▋                                                                                                                                      | 18/84 [04:55<16:33, 15.06s/it] 23%|██████████████████████████████████████▋                                                                                                                                    | 19/84 [05:10<16:18, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.4957, 'grad_norm': 1.1276423931121826, 'learning_rate': 9.745127499627415e-06, 'ppl': 12.13022, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.6016540527344, 'epoch': 0.45, 'tokens/total': 311296.0, 'tokens/trainable': 223722.0}
 23%|██████████████████████████████████████▋                                                                                                                                    | 19/84 [05:10<16:18, 15.05s/it] 24%|████████████████████████████████████████▋                                                                                                                                  | 20/84 [05:25<16:03, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.5416, 'grad_norm': 1.1429506540298462, 'learning_rate': 9.726443749968894e-06, 'ppl': 12.69997, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 474.4263000488281, 'epoch': 0.48, 'tokens/total': 327680.0, 'tokens/trainable': 236639.0}
 24%|████████████████████████████████████████▋                                                                                                                                  | 20/84 [05:25<16:03, 15.05s/it] 25%|██████████████████████████████████████████▊                                                                                                                                | 21/84 [05:40<15:48, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.437, 'grad_norm': 0.9757459759712219, 'learning_rate': 9.707241588330362e-06, 'ppl': 11.43867, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.1600341796875, 'epoch': 0.5, 'tokens/total': 344064.0, 'tokens/trainable': 248033.0}
 25%|██████████████████████████████████████████▊                                                                                                                                | 21/84 [05:40<15:48, 15.05s/it] 26%|████████████████████████████████████████████▊                                                                                                                              | 22/84 [05:55<15:34, 15.08s/it]                                                                                                                                                                                                                {'loss': 2.2622, 'grad_norm': 0.9250922799110413, 'learning_rate': 9.687500096333679e-06, 'ppl': 9.6042, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 341.8017578125, 'epoch': 0.52, 'tokens/total': 360448.0, 'tokens/trainable': 259928.0}
 26%|████████████████████████████████████████████▊                                                                                                                              | 22/84 [05:55<15:34, 15.08s/it] 27%|██████████████████████████████████████████████▊                                                                                                                            | 23/84 [06:10<15:20, 15.10s/it]                                                                                                                                                                                                                {'loss': 2.6973, 'grad_norm': 1.0023396015167236, 'learning_rate': 9.667194717621896e-06, 'ppl': 14.83961, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 456.94940185546875, 'epoch': 0.55, 'tokens/total': 376832.0, 'tokens/trainable': 273229.0}
 27%|██████████████████████████████████████████████▊                                                                                                                            | 23/84 [06:10<15:20, 15.10s/it] 29%|████████████████████████████████████████████████▊                                                                                                                          | 24/84 [06:25<15:05, 15.09s/it]                                                                                                                                                                                                                {'loss': 2.4554, 'grad_norm': 0.8203016519546509, 'learning_rate': 9.646301805332769e-06, 'ppl': 11.65109, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 377.00543212890625, 'epoch': 0.57, 'tokens/total': 393216.0, 'tokens/trainable': 285582.0}
 29%|████████████████████████████████████████████████▊                                                                                                                          | 24/84 [06:25<15:05, 15.09s/it] 30%|██████████████████████████████████████████████████▉                                                                                                                        | 25/84 [06:40<14:48, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.4833, 'grad_norm': 0.9041063785552979, 'learning_rate': 9.624795893614646e-06, 'ppl': 11.98074, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 307.10601806640625, 'epoch': 0.6, 'tokens/total': 409600.0, 'tokens/trainable': 296666.0}
 30%|██████████████████████████████████████████████████▉                                                                                                                        | 25/84 [06:40<14:48, 15.06s/it] 31%|████████████████████████████████████████████████████▉                                                                                                                      | 26/84 [06:55<14:34, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.5628, 'grad_norm': 0.9094843864440918, 'learning_rate': 9.602648788131773e-06, 'ppl': 12.97209, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 424.59381103515625, 'epoch': 0.62, 'tokens/total': 425984.0, 'tokens/trainable': 309496.0}
 31%|████████████████████████████████████████████████████▉                                                                                                                      | 26/84 [06:55<14:34, 15.07s/it] 32%|██████████████████████████████████████████████████████▉                                                                                                                    | 27/84 [07:10<14:19, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.5322, 'grad_norm': 0.9065321683883667, 'learning_rate': 9.579831385053694e-06, 'ppl': 12.58115, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.19561767578125, 'epoch': 0.64, 'tokens/total': 442368.0, 'tokens/trainable': 323050.0}
 32%|██████████████████████████████████████████████████████▉                                                                                                                    | 27/84 [07:10<14:19, 15.07s/it] 33%|█████████████████████████████████████████████████████████                                                                                                                  | 28/84 [07:25<14:03, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.4975, 'grad_norm': 0.9096329212188721, 'learning_rate': 9.55631367105525e-06, 'ppl': 12.15208, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 276.995361328125, 'epoch': 0.67, 'tokens/total': 458752.0, 'tokens/trainable': 332229.0}
 33%|█████████████████████████████████████████████████████████                                                                                                                  | 28/84 [07:25<14:03, 15.06s/it] 35%|███████████████████████████████████████████████████████████                                                                                                                | 29/84 [07:41<13:48, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.2896, 'grad_norm': 0.811943531036377, 'learning_rate': 9.532061994832475e-06, 'ppl': 9.87099, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.5038146972656, 'epoch': 0.69, 'tokens/total': 475136.0, 'tokens/trainable': 342910.0}
 35%|███████████████████████████████████████████████████████████                                                                                                                | 29/84 [07:41<13:48, 15.07s/it] 36%|█████████████████████████████████████████████████████████████                                                                                                              | 30/84 [07:56<13:32, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.1711, 'grad_norm': 0.7714135646820068, 'learning_rate': 9.507041795586701e-06, 'ppl': 8.76792, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.57464599609375, 'epoch': 0.71, 'tokens/total': 491520.0, 'tokens/trainable': 354990.0}
 36%|█████████████████████████████████████████████████████████████                                                                                                              | 30/84 [07:56<13:32, 15.05s/it] 37%|███████████████████████████████████████████████████████████████                                                                                                            | 31/84 [08:11<13:18, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.5228, 'grad_norm': 0.7664998769760132, 'learning_rate': 9.48121669352986e-06, 'ppl': 12.46345, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.0357666015625, 'epoch': 0.74, 'tokens/total': 507904.0, 'tokens/trainable': 368404.0}
 37%|███████████████████████████████████████████████████████████████                                                                                                            | 31/84 [08:11<13:18, 15.06s/it] 38%|█████████████████████████████████████████████████████████████████▏                                                                                                         | 32/84 [08:26<13:03, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.4413, 'grad_norm': 0.7703967094421387, 'learning_rate': 9.454544851905666e-06, 'ppl': 11.48797, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.70343017578125, 'epoch': 0.76, 'tokens/total': 524288.0, 'tokens/trainable': 381274.0}
 38%|█████████████████████████████████████████████████████████████████▏                                                                                                         | 32/84 [08:26<13:03, 15.06s/it] 39%|███████████████████████████████████████████████████████████████████▏                                                                                                       | 33/84 [08:41<12:47, 15.04s/it]                                                                                                                                                                                                                {'loss': 2.3928, 'grad_norm': 0.8420023918151855, 'learning_rate': 9.426987162441947e-06, 'ppl': 10.94409, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 350.9320983886719, 'epoch': 0.79, 'tokens/total': 540672.0, 'tokens/trainable': 392843.0}
 39%|███████████████████████████████████████████████████████████████████▏                                                                                                       | 33/84 [08:41<12:47, 15.04s/it] 40%|█████████████████████████████████████████████████████████████████████▏                                                                                                     | 34/84 [08:56<12:33, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.4511, 'grad_norm': 0.7319009900093079, 'learning_rate': 9.398496331414208e-06, 'ppl': 11.6011, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 461.1392822265625, 'epoch': 0.81, 'tokens/total': 557056.0, 'tokens/trainable': 406127.0}
 40%|█████████████████████████████████████████████████████████████████████▏                                                                                                     | 34/84 [08:56<12:33, 15.07s/it] 42%|███████████████████████████████████████████████████████████████████████▎                                                                                                   | 35/84 [09:11<12:18, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.6036, 'grad_norm': 0.845214307308197, 'learning_rate': 9.369024155603256e-06, 'ppl': 13.51229, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.6831359863281, 'epoch': 0.83, 'tokens/total': 573440.0, 'tokens/trainable': 416879.0}
 42%|███████████████████████████████████████████████████████████████████████▎                                                                                                   | 35/84 [09:11<12:18, 15.07s/it] 43%|█████████████████████████████████████████████████████████████████████████▎                                                                                                 | 36/84 [09:26<12:02, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.3314, 'grad_norm': 0.7980889678001404, 'learning_rate': 9.338521522295196e-06, 'ppl': 10.29234, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 359.2618103027344, 'epoch': 0.86, 'tokens/total': 589824.0, 'tokens/trainable': 428113.0}
 43%|█████████████████████████████████████████████████████████████████████████▎                                                                                                 | 36/84 [09:26<12:02, 15.06s/it] 44%|███████████████████████████████████████████████████████████████████████████▎                                                                                               | 37/84 [09:41<11:49, 15.09s/it]                                                                                                                                                                                                                {'loss': 2.4644, 'grad_norm': 0.7808263897895813, 'learning_rate': 9.306930223829113e-06, 'ppl': 11.75643, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.8631591796875, 'epoch': 0.88, 'tokens/total': 606208.0, 'tokens/trainable': 441366.0}
 44%|███████████████████████████████████████████████████████████████████████████▎                                                                                               | 37/84 [09:41<11:49, 15.09s/it] 45%|█████████████████████████████████████████████████████████████████████████████▎                                                                                             | 38/84 [09:56<11:33, 15.07s/it]                                                                                                                                                                                                                {'loss': 1.9823, 'grad_norm': 0.7255628705024719, 'learning_rate': 9.274192962038796e-06, 'ppl': 7.25942, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.92559814453125, 'epoch': 0.9, 'tokens/total': 622592.0, 'tokens/trainable': 453641.0}
 45%|█████████████████████████████████████████████████████████████████████████████▎                                                                                             | 38/84 [09:56<11:33, 15.07s/it] 46%|███████████████████████████████████████████████████████████████████████████████▍                                                                                           | 39/84 [10:11<11:17, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.3952, 'grad_norm': 0.8048214316368103, 'learning_rate': 9.240246072295122e-06, 'ppl': 10.97039, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 308.4759521484375, 'epoch': 0.93, 'tokens/total': 638976.0, 'tokens/trainable': 465009.0}
 46%|███████████████████████████████████████████████████████████████████████████████▍                                                                                           | 39/84 [10:11<11:17, 15.05s/it] 48%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 40/84 [10:26<11:02, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.119, 'grad_norm': 0.834456741809845, 'learning_rate': 9.205020433000755e-06, 'ppl': 8.32281, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 238.43943786621094, 'epoch': 0.95, 'tokens/total': 655360.0, 'tokens/trainable': 474320.0}
 48%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 40/84 [10:26<11:02, 15.05s/it] 49%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 41/84 [10:41<10:47, 15.05s/it]                                                                                                                                                                                                                {'loss': 2.1589, 'grad_norm': 0.7231636047363281, 'learning_rate': 9.168443284579553e-06, 'ppl': 8.6616, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.91583251953125, 'epoch': 0.98, 'tokens/total': 671744.0, 'tokens/trainable': 486804.0}
 49%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 41/84 [10:41<10:47, 15.05s/it] 50%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 42/84 [10:56<10:32, 15.07s/it]                                                                                                                                                                                                                {'loss': 2.2662, 'grad_norm': 0.7623542547225952, 'learning_rate': 9.13043459149776e-06, 'ppl': 9.64269, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 316.7720031738281, 'epoch': 1.0, 'tokens/total': 688128.0, 'tokens/trainable': 498319.0}
 50%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 42/84 [10:56<10:32, 15.07s/it][2026-01-04 00:23:42,204] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-42
 51%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 43/84 [11:15<11:01, 16.15s/it]                                                                                                                                                                                                                {'loss': 2.364, 'grad_norm': 0.820637583732605, 'learning_rate': 9.09090886125341e-06, 'ppl': 10.6334, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 388.0730285644531, 'epoch': 1.02, 'tokens/total': 704512.0, 'tokens/trainable': 509948.0}
 51%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 43/84 [11:15<11:01, 16.15s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 44/84 [11:30<10:33, 15.83s/it]                                                                                                                                                                                                                {'loss': 2.2541, 'grad_norm': 0.7583165168762207, 'learning_rate': 9.049773325386923e-06, 'ppl': 9.52672, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 406.7909851074219, 'epoch': 1.05, 'tokens/total': 720896.0, 'tokens/trainable': 521725.0}
 52%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 44/84 [11:30<10:33, 15.83s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 45/84 [11:45<10:07, 15.58s/it]                                                                                                                                                                                                                {'loss': 2.184, 'grad_norm': 0.7426517009735107, 'learning_rate': 9.006927939481102e-06, 'ppl': 8.88176, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.7640380859375, 'epoch': 1.07, 'tokens/total': 737280.0, 'tokens/trainable': 533462.0}
 54%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 45/84 [11:45<10:07, 15.58s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 46/84 [12:00<09:45, 15.42s/it]                                                                                                                                                                                                                {'loss': 2.2603, 'grad_norm': 0.7592601776123047, 'learning_rate': 8.962263564171735e-06, 'ppl': 9.58596, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.5147399902344, 'epoch': 1.1, 'tokens/total': 753664.0, 'tokens/trainable': 544963.0}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 46/84 [12:00<09:45, 15.42s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 47/84 [12:15<09:27, 15.33s/it]                                                                                                                                                                                                                {'loss': 2.3293, 'grad_norm': 0.712399423122406, 'learning_rate': 8.915662874642294e-06, 'ppl': 10.27075, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.820068359375, 'epoch': 1.12, 'tokens/total': 770048.0, 'tokens/trainable': 557568.0}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 47/84 [12:15<09:27, 15.33s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 48/84 [12:30<09:10, 15.28s/it]                                                                                                                                                                                                                {'loss': 2.3301, 'grad_norm': 0.6970159411430359, 'learning_rate': 8.866994903655723e-06, 'ppl': 10.27897, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 479.2054443359375, 'epoch': 1.14, 'tokens/total': 786432.0, 'tokens/trainable': 571098.0}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 48/84 [12:30<09:10, 15.28s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 49/84 [12:46<08:53, 15.24s/it]                                                                                                                                                                                                                {'loss': 2.1423, 'grad_norm': 0.8117857575416565, 'learning_rate': 8.81612049852265e-06, 'ppl': 8.51901, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 311.01019287109375, 'epoch': 1.17, 'tokens/total': 802816.0, 'tokens/trainable': 580687.0}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 49/84 [12:46<08:53, 15.24s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 50/84 [13:01<08:37, 15.21s/it]                                                                                                                                                                                                                {'loss': 2.287, 'grad_norm': 0.7534531950950623, 'learning_rate': 8.762885954638477e-06, 'ppl': 9.84536, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 460.1733703613281, 'epoch': 1.19, 'tokens/total': 819200.0, 'tokens/trainable': 593415.0}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 50/84 [13:01<08:37, 15.21s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 51/84 [13:16<08:21, 15.18s/it]                                                                                                                                                                                                                {'loss': 2.2903, 'grad_norm': 0.7555272579193115, 'learning_rate': 8.707123924978077e-06, 'ppl': 9.8779, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 357.5971374511719, 'epoch': 1.21, 'tokens/total': 835584.0, 'tokens/trainable': 605122.0}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 51/84 [13:16<08:21, 15.18s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 52/84 [13:31<08:04, 15.16s/it]                                                                                                                                                                                                                {'loss': 2.0339, 'grad_norm': 0.7037354707717896, 'learning_rate': 8.648648872622289e-06, 'ppl': 7.64384, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 445.2397155761719, 'epoch': 1.24, 'tokens/total': 851968.0, 'tokens/trainable': 617663.0}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 52/84 [13:31<08:04, 15.16s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 53/84 [13:46<07:49, 15.14s/it]                                                                                                                                                                                                                {'loss': 2.208, 'grad_norm': 0.7909743189811707, 'learning_rate': 8.587257980252616e-06, 'ppl': 9.0975, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 342.66082763671875, 'epoch': 1.26, 'tokens/total': 868352.0, 'tokens/trainable': 628294.0}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 53/84 [13:46<07:49, 15.14s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 54/84 [14:01<07:33, 15.12s/it]                                                                                                                                                                                                                {'loss': 2.0099, 'grad_norm': 0.9540871977806091, 'learning_rate': 8.522727512172423e-06, 'ppl': 7.46257, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.34002685546875, 'epoch': 1.29, 'tokens/total': 884736.0, 'tokens/trainable': 639131.0}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 54/84 [14:01<07:33, 15.12s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 55/84 [14:16<07:18, 15.11s/it]                                                                                                                                                                                                                {'loss': 2.1202, 'grad_norm': 0.7127824425697327, 'learning_rate': 8.454810085822828e-06, 'ppl': 8.3328, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 309.6125793457031, 'epoch': 1.31, 'tokens/total': 901120.0, 'tokens/trainable': 650860.0}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 55/84 [14:16<07:18, 15.11s/it] 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 56/84 [14:31<07:02, 15.08s/it]                                                                                                                                                                                                                {'loss': 2.2369, 'grad_norm': 0.8307490944862366, 'learning_rate': 8.383233762288e-06, 'ppl': 9.36426, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 360.55908203125, 'epoch': 1.33, 'tokens/total': 917504.0, 'tokens/trainable': 660335.0}
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 56/84 [14:31<07:02, 15.08s/it] 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 57/84 [14:46<06:47, 15.10s/it]                                                                                                                                                                                                                {'loss': 2.4058, 'grad_norm': 0.8058661818504333, 'learning_rate': 8.307692041853443e-06, 'ppl': 11.0873, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 369.5503845214844, 'epoch': 1.36, 'tokens/total': 933888.0, 'tokens/trainable': 672664.0}
 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 57/84 [14:46<06:47, 15.10s/it] 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 58/84 [15:01<06:32, 15.10s/it]                                                                                                                                                                                                                {'loss': 2.2731, 'grad_norm': 0.7699398994445801, 'learning_rate': 8.227847501984797e-06, 'ppl': 9.70945, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.2121887207031, 'epoch': 1.38, 'tokens/total': 950272.0, 'tokens/trainable': 685353.0}
 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 58/84 [15:01<06:32, 15.10s/it] 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 59/84 [15:17<06:17, 15.09s/it]                                                                                                                                                                                                                {'loss': 2.2848, 'grad_norm': 0.7645865678787231, 'learning_rate': 8.143322702380829e-06, 'ppl': 9.82372, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 433.5550842285156, 'epoch': 1.4, 'tokens/total': 966656.0, 'tokens/trainable': 697377.0}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 59/84 [15:17<06:17, 15.09s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 60/84 [15:32<06:01, 15.08s/it]                                                                                                                                                                                                                {'loss': 2.2004, 'grad_norm': 0.7868902087211609, 'learning_rate': 8.053691090026405e-06, 'ppl': 9.02862, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 322.94708251953125, 'epoch': 1.43, 'tokens/total': 983040.0, 'tokens/trainable': 708580.0}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 60/84 [15:32<06:01, 15.08s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 61/84 [15:47<05:47, 15.10s/it]                                                                                                                                                                                                                {'loss': 2.1096, 'grad_norm': 0.8960193991661072, 'learning_rate': 7.9584779086872e-06, 'ppl': 8.24494, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.5703125, 'epoch': 1.45, 'tokens/total': 999424.0, 'tokens/trainable': 721379.0}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 61/84 [15:47<05:47, 15.10s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 62/84 [16:02<05:31, 15.08s/it]                                                                                                                                                                                                                {'loss': 1.994, 'grad_norm': 0.7898715138435364, 'learning_rate': 7.857142918510363e-06, 'ppl': 7.34485, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 345.618408203125, 'epoch': 1.48, 'tokens/total': 1015808.0, 'tokens/trainable': 732230.0}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 62/84 [16:02<05:31, 15.08s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 63/84 [16:17<05:16, 15.09s/it]                                                                                                                                                                                                                {'loss': 2.1251, 'grad_norm': 0.7547686100006104, 'learning_rate': 7.749077667540405e-06, 'ppl': 8.37373, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.9949951171875, 'epoch': 1.5, 'tokens/total': 1032192.0, 'tokens/trainable': 743308.0}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 63/84 [16:17<05:16, 15.09s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 64/84 [16:32<05:01, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.2899, 'grad_norm': 0.8282785415649414, 'learning_rate': 7.633587301825173e-06, 'ppl': 9.87395, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 389.5559997558594, 'epoch': 1.52, 'tokens/total': 1048576.0, 'tokens/trainable': 754578.0}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 64/84 [16:32<05:01, 15.06s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 65/84 [16:47<04:46, 15.07s/it]                                                                                                                                                                                                                {'loss': 1.982, 'grad_norm': 0.7019599080085754, 'learning_rate': 7.509881015721476e-06, 'ppl': 7.25724, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 467.1014709472656, 'epoch': 1.55, 'tokens/total': 1064960.0, 'tokens/trainable': 767778.0}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 65/84 [16:47<04:46, 15.07s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 66/84 [17:02<04:31, 15.06s/it]                                                                                                                                                                                                                {'loss': 2.177, 'grad_norm': 0.8358809351921082, 'learning_rate': 7.3770493145275395e-06, 'ppl': 8.81981, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 329.0794982910156, 'epoch': 1.57, 'tokens/total': 1081344.0, 'tokens/trainable': 778410.0}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 66/84 [17:02<04:31, 15.06s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 67/84 [17:17<04:16, 15.07s/it]                                                                                                                                                                                                                {'loss': 1.9627, 'grad_norm': 0.7629905343055725, 'learning_rate': 7.234042186610168e-06, 'ppl': 7.11852, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 318.5070495605469, 'epoch': 1.6, 'tokens/total': 1097728.0, 'tokens/trainable': 790046.0}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 67/84 [17:17<04:16, 15.07s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 68/84 [17:32<04:01, 15.09s/it]                                                                                                                                                                                                                {'loss': 2.2829, 'grad_norm': 0.7605993151664734, 'learning_rate': 7.079645911289845e-06, 'ppl': 9.80507, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 432.19000244140625, 'epoch': 1.62, 'tokens/total': 1114112.0, 'tokens/trainable': 803491.0}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 68/84 [17:32<04:01, 15.09s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 69/84 [17:47<03:46, 15.11s/it]                                                                                                                                                                                                                {'loss': 2.1591, 'grad_norm': 0.7556662559509277, 'learning_rate': 6.912442131579155e-06, 'ppl': 8.66334, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 449.40435791015625, 'epoch': 1.64, 'tokens/total': 1130496.0, 'tokens/trainable': 816919.0}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 69/84 [17:47<03:46, 15.11s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 70/84 [18:02<03:31, 15.12s/it]                                                                                                                                                                                                                {'loss': 2.0868, 'grad_norm': 0.7916563749313354, 'learning_rate': 6.73076920065796e-06, 'ppl': 8.05908, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.95281982421875, 'epoch': 1.67, 'tokens/total': 1146880.0, 'tokens/trainable': 827854.0}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 70/84 [18:02<03:31, 15.12s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 71/84 [18:18<03:16, 15.14s/it]                                                                                                                                                                                                                {'loss': 2.046, 'grad_norm': 0.7467530965805054, 'learning_rate': 6.532663064717781e-06, 'ppl': 7.73689, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.4479675292969, 'epoch': 1.69, 'tokens/total': 1163264.0, 'tokens/trainable': 841071.0}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 71/84 [18:18<03:16, 15.14s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 72/84 [18:33<03:01, 15.15s/it]                                                                                                                                                                                                                {'loss': 2.2584, 'grad_norm': 0.769206166267395, 'learning_rate': 6.315789050859166e-06, 'ppl': 9.56777, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.147216796875, 'epoch': 1.71, 'tokens/total': 1179648.0, 'tokens/trainable': 853505.0}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 72/84 [18:33<03:01, 15.15s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 73/84 [18:48<02:46, 15.13s/it]                                                                                                                                                                                                                {'loss': 2.2368, 'grad_norm': 0.8090022206306458, 'learning_rate': 6.0773477343900595e-06, 'ppl': 9.36332, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 312.2845153808594, 'epoch': 1.74, 'tokens/total': 1196032.0, 'tokens/trainable': 864232.0}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 73/84 [18:48<02:46, 15.13s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 74/84 [19:03<02:31, 15.13s/it]                                                                                                                                                                                                                {'loss': 2.2596, 'grad_norm': 0.7845657467842102, 'learning_rate': 5.81395306653576e-06, 'ppl': 9.57926, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.0311279296875, 'epoch': 1.76, 'tokens/total': 1212416.0, 'tokens/trainable': 876412.0}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 74/84 [19:03<02:31, 15.13s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 75/84 [19:18<02:15, 15.10s/it]                                                                                                                                                                                                                {'loss': 2.0239, 'grad_norm': 0.8282763361930847, 'learning_rate': 5.52147184862406e-06, 'ppl': 7.56778, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.2695617675781, 'epoch': 1.79, 'tokens/total': 1228800.0, 'tokens/trainable': 887204.0}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 75/84 [19:18<02:15, 15.10s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 76/84 [19:33<02:00, 15.11s/it]                                                                                                                                                                                                                {'loss': 2.0459, 'grad_norm': 0.8150551319122314, 'learning_rate': 5.19480499860947e-06, 'ppl': 7.73612, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.22265625, 'epoch': 1.81, 'tokens/total': 1245184.0, 'tokens/trainable': 898698.0}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 76/84 [19:33<02:00, 15.11s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 77/84 [19:48<01:46, 15.16s/it]                                                                                                                                                                                                                {'loss': 2.3338, 'grad_norm': 0.8316982388496399, 'learning_rate': 4.82758605357958e-06, 'ppl': 10.31707, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 442.561279296875, 'epoch': 1.83, 'tokens/total': 1261568.0, 'tokens/trainable': 910126.0}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 77/84 [19:48<01:46, 15.16s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 78/84 [20:04<01:30, 15.16s/it]                                                                                                                                                                                                                {'loss': 2.3689, 'grad_norm': 0.8292647004127502, 'learning_rate': 4.411764621181646e-06, 'ppl': 10.68563, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.56951904296875, 'epoch': 1.86, 'tokens/total': 1277952.0, 'tokens/trainable': 922871.0}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 78/84 [20:04<01:30, 15.16s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 79/84 [20:19<01:15, 15.17s/it]                                                                                                                                                                                                                {'loss': 2.2947, 'grad_norm': 0.7834141850471497, 'learning_rate': 3.937007477361476e-06, 'ppl': 9.92146, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 476.15325927734375, 'epoch': 1.88, 'tokens/total': 1294336.0, 'tokens/trainable': 935425.0}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 79/84 [20:19<01:15, 15.17s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 80/84 [20:34<01:00, 15.18s/it]                                                                                                                                                                                                                {'loss': 2.0461, 'grad_norm': 0.7962015271186829, 'learning_rate': 3.3898304536705837e-06, 'ppl': 7.73767, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.98101806640625, 'epoch': 1.9, 'tokens/total': 1310720.0, 'tokens/trainable': 947062.0}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 80/84 [20:34<01:00, 15.18s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 81/84 [20:49<00:45, 15.18s/it]                                                                                                                                                                                                                {'loss': 2.2245, 'grad_norm': 0.7918537259101868, 'learning_rate': 2.7522935397428228e-06, 'ppl': 9.24886, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 347.5594787597656, 'epoch': 1.93, 'tokens/total': 1327104.0, 'tokens/trainable': 959572.0}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 81/84 [20:49<00:45, 15.18s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 82/84 [21:04<00:30, 15.19s/it]                                                                                                                                                                                                                {'loss': 2.4159, 'grad_norm': 0.7803449630737305, 'learning_rate': 1.9999999949504854e-06, 'ppl': 11.19985, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 441.7668762207031, 'epoch': 1.95, 'tokens/total': 1343488.0, 'tokens/trainable': 973127.0}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 82/84 [21:04<00:30, 15.19s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 83/84 [21:20<00:15, 15.19s/it]                                                                                                                                                                                                                {'loss': 2.13, 'grad_norm': 0.7520664930343628, 'learning_rate': 1.0989010661432985e-06, 'ppl': 8.41487, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 448.2036437988281, 'epoch': 1.98, 'tokens/total': 1359872.0, 'tokens/trainable': 985757.0}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 83/84 [21:20<00:15, 15.19s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [21:35<00:00, 15.20s/it]                                                                                                                                                                                                                {'loss': 2.2931, 'grad_norm': 0.9194205403327942, 'learning_rate': 0.0, 'ppl': 9.9056, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 277.64056396484375, 'epoch': 2.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [21:35<00:00, 15.20s/it][2026-01-04 00:34:20,737] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-84
                                                                                                                                                                                                                {'train_runtime': 1438.4013, 'train_samples_per_second': 0.234, 'train_steps_per_second': 0.058, 'train_loss': 2.380285389366604, 'memory/max_active (GiB)': 15.96, 'memory/max_allocated (GiB)': 15.96, 'memory/device_reserved (GiB)': 20.93, 'epoch': 2.0, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [21:36<00:00, 15.20s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [21:36<00:00, 15.44s/it]
[2026-01-04 00:34:22,468] [INFO] [axolotl.train.save_trained_model:233] [PID:15692] Training completed! Saving trained model to ./output.
[2026-01-04 00:34:23,290] [INFO] [axolotl.train.save_trained_model:351] [PID:15692] Model successfully saved to ./output
[0m