tankimi-l33-adpt / debug.log

Upload folder using huggingface_hub

4dc4124 verified about 1 month ago

127 kB

	[2026-01-04 00:10:03,594] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:15692] bf16 support detected, enabling for this configuration.
	[2026-01-04 00:10:03,667] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:15692] baseline 0.000GB ()
	[2026-01-04 00:10:03,668] [INFO] [axolotl.cli.config.load_cfg:259] [PID:15692] config:
	{
	"activation_offloading": true,
	"adapter": "lora",
	"axolotl_config_path": "train.yml",
	"base_model": "shb777/Llama-3.3-8B-Instruct-128K",
	"base_model_config": "shb777/Llama-3.3-8B-Instruct-128K",
	"batch_size": 4,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"chat_template": "llama3",
	"context_parallel_size": 1,
	"cut_cross_entropy": true,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 96,
	"dataset_prepared_path": "dataset_prepareds",
	"datasets": [
	{
	"chat_template": "tokenizer_default",
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "WokeAI/polititune-tankie-warmup",
	"split": "train",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.8.0"
	},
	"eval_batch_size": 2,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_sample_packing": false,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"flash_attention": true,
	"fp16": false,
	"gradient_accumulation_steps": 2,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": true
	},
	"group_by_length": false,
	"include_tkps": true,
	"is_llama_derived_model": true,
	"learning_rate": 1e-05,
	"liger_fused_linear_cross_entropy": false,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_alpha": 16,
	"lora_dropout": 0.35,
	"lora_r": 64,
	"lora_target_linear": true,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "rex",
	"max_grad_norm": 0.1,
	"mean_resizing_embeddings": false,
	"micro_batch_size": 2,
	"model_config_type": "llama",
	"num_epochs": 2.0,
	"optimizer": "adamw_torch_8bit",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./output",
	"pad_to_sequence_len": true,
	"peft_use_rslora": true,
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin",
	"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
	],
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"remove_unused_columns": false,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": true,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"sequence_len": 4096,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"special_tokens": {
	"pad_token": "<\|reserved_special_token_2\|>"
	},
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tf32": false,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "shb777/Llama-3.3-8B-Instruct-128K",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_otel_metrics": false,
	"use_ray": false,
	"use_wandb": true,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"wandb_project": "newyear",
	"weight_decay": 0.0,
	"world_size": 1
	}
	[2026-01-04 00:10:03,672] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:15692] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
	[2026-01-04 00:10:04,197] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <\|eot_id\|>
	[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <\|begin_of_text\|>
	[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <\|reserved_special_token_2\|>
	[2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None
	[2026-01-04 00:10:04,199] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:15692] Loading prepared dataset from disk at dataset_prepareds/a420619428aa6c5576289a496238883a...
	[2026-01-04 00:10:04,213] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:15692] total_num_tokens: 684_427
	[2026-01-04 00:10:04,225] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:15692] `total_supervised_tokens: 498_319`
	[2026-01-04 00:10:04,242] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
	[2026-01-04 00:10:05,245] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
	[2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.21780610084533691
	[2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
	[2026-01-04 00:10:05,685] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.22262859344482422
	[2026-01-04 00:10:05,686] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
	[2026-01-04 00:10:05,930] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.2447659969329834
	[2026-01-04 00:10:05,931] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially.
	[2026-01-04 00:10:06,188] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.25751280784606934
	[2026-01-04 00:10:06,211] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84]
	[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:494] [PID:15692] data_loader_len: 42
	[2026-01-04 00:10:06,212] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:510] [PID:15692] sample_packing_eff_est across ranks: [0.9946216401599702]
	[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:15692] sample_packing_eff_est: 1.0
	[2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:15692] total_num_steps: 84
	[2026-01-04 00:10:06,212] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:15692] Maximum number of steps set at 84
	[2026-01-04 00:10:06,240] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:15692] loading tokenizer... shb777/Llama-3.3-8B-Instruct-128K
	[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <\|eot_id\|>
	[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <\|begin_of_text\|>
	[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <\|reserved_special_token_2\|>
	[2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None
	[2026-01-04 00:10:06,719] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:15692] Loading model
	[2026-01-04 00:10:06,766] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:15692] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-01-04 00:10:06,767] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:15692] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-01-04 00:10:06,768] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:344] [PID:15692] Applying multipack dataloader patch for sample packing...
	[2026-01-04 00:10:06,873] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:15692] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': False, 'rms_norm': True, 'swiglu': True}
	[2026-01-04 00:10:07,074] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:15692] Applying Cut Cross Entropy to model type: llama
	Loading checkpoint shards: 0%\| \| 0/4 [00:00<?, ?it/s] Loading checkpoint shards: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4/4 [00:00<00:00, 133.41it/s]
	[2026-01-04 00:10:09,611] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:15692] Converting modules to torch.bfloat16
	[2026-01-04 00:10:10,928] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] Memory usage after model load 0.000GB ()
	[2026-01-04 00:10:10,929] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:15692] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
	trainable params: 167,772,160 \|\| all params: 8,198,033,408 \|\| trainable%: 2.0465
	[2026-01-04 00:10:12,458] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] after adapters 0.000GB ()
	[2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:413] [PID:15692] Pre-saving adapter config to ./output...
	[2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:417] [PID:15692] Pre-saving tokenizer to ./output...
	[2026-01-04 00:10:20,929] [INFO] [axolotl.train.save_initial_configs:422] [PID:15692] Pre-saving model config to ./output...
	[2026-01-04 00:10:20,931] [INFO] [axolotl.train.execute_training:212] [PID:15692] Starting trainer...
	[2026-01-04 00:10:22,337] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.49930906295776367
	[2026-01-04 00:10:22,811] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.47399091720581055
	[2026-01-04 00:10:23,297] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.4854564666748047
	[2026-01-04 00:10:23,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.5046358108520508
	[2026-01-04 00:10:23,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84]
	[34m[1mwandb[0m: (1) Create a W&B account
	[34m[1mwandb[0m: (2) Use an existing W&B account
	[34m[1mwandb[0m: (3) Don't visualize my results
	[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
	[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
	[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
	[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
	[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
	[34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
	[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
	[Am[2K [34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
	[Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.23.1
	[34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20260104_001243-myor4kbd[0m
	[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
	[34m[1mwandb[0m: Syncing run [33mdistinctive-firebrand-5[0m
	[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fizzzz/newyear[0m
	[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/fizzzz/newyear/runs/myor4kbd[0m
	[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
	[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
	[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
	[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
	[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
	[2026-01-04 00:12:45,380] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:15692] The Axolotl config has been saved to the WandB run under files.
	0%\| \| 0/84 [00:00<?, ?it/s] 1%\|██ \| 1/84 [00:36<50:25, 36.45s/it] {'loss': 3.0634, 'grad_norm': 8.193868637084961, 'learning_rate': 4.999999873689376e-06, 'ppl': 21.40019, 'memory/max_active (GiB)': 20.05, 'memory/max_allocated (GiB)': 20.05, 'memory/device_reserved (GiB)': 20.62, 'tokens/train_per_sec_per_gpu': 155.38671875, 'epoch': 0.02, 'tokens/total': 16384.0, 'tokens/trainable': 9972.0}
	1%\|██ \| 1/84 [00:36<50:25, 36.45s/it] 2%\|████ \| 2/84 [00:53<33:55, 24.83s/it] {'loss': 3.2197, 'grad_norm': 8.957365036010742, 'learning_rate': 9.999999747378752e-06, 'ppl': 25.02061, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.190185546875, 'epoch': 0.05, 'tokens/total': 32768.0, 'tokens/trainable': 21140.0}
	2%\|████ \| 2/84 [00:53<33:55, 24.83s/it] 4%\|██████▏ \| 3/84 [01:08<27:24, 20.31s/it] {'loss': 3.0878, 'grad_norm': 7.201009273529053, 'learning_rate': 9.987669727706816e-06, 'ppl': 21.92878, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 446.0367736816406, 'epoch': 0.07, 'tokens/total': 49152.0, 'tokens/trainable': 33455.0}
	4%\|██████▏ \| 3/84 [01:08<27:24, 20.31s/it] 5%\|████████▏ \| 4/84 [01:22<24:13, 18.17s/it] {'loss': 3.0947, 'grad_norm': 6.2549729347229, 'learning_rate': 9.97506231215084e-06, 'ppl': 22.08061, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 355.3751220703125, 'epoch': 0.1, 'tokens/total': 65536.0, 'tokens/trainable': 44394.0}
	5%\|████████▏ \| 4/84 [01:22<24:13, 18.17s/it] 6%\|██████████▏ \| 5/84 [01:39<23:21, 17.74s/it] {'loss': 2.6876, 'grad_norm': 5.082422733306885, 'learning_rate': 9.962168405763805e-06, 'ppl': 14.69636, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 351.228515625, 'epoch': 0.12, 'tokens/total': 81920.0, 'tokens/trainable': 56961.0}
	6%\|██████████▏ \| 5/84 [01:39<23:21, 17.74s/it] 7%\|████████████▎ \| 6/84 [01:54<21:49, 16.78s/it] {'loss': 3.0661, 'grad_norm': 4.808957099914551, 'learning_rate': 9.948978913598694e-06, 'ppl': 21.45805, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 418.3861083984375, 'epoch': 0.14, 'tokens/total': 98304.0, 'tokens/trainable': 69788.0}
	7%\|████████████▎ \| 6/84 [01:54<21:49, 16.78s/it] 8%\|██████████████▎ \| 7/84 [02:09<20:46, 16.19s/it] {'loss': 2.7881, 'grad_norm': 3.901789426803589, 'learning_rate': 9.935483831213787e-06, 'ppl': 16.25012, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.1433410644531, 'epoch': 0.17, 'tokens/total': 114688.0, 'tokens/trainable': 81854.0}
	8%\|██████████████▎ \| 7/84 [02:09<20:46, 16.19s/it] 10%\|████████████████▍ \| 8/84 [02:24<20:00, 15.80s/it] {'loss': 2.556, 'grad_norm': 3.0737860202789307, 'learning_rate': 9.921671335177962e-06, 'ppl': 12.88418, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 393.31671142578125, 'epoch': 0.19, 'tokens/total': 131072.0, 'tokens/trainable': 94072.0}
	10%\|████████████████▍ \| 8/84 [02:24<20:00, 15.80s/it] 11%\|██████████████████▍ \| 9/84 [02:39<19:25, 15.54s/it] {'loss': 2.8264, 'grad_norm': 2.6989502906799316, 'learning_rate': 9.907529602060094e-06, 'ppl': 16.88457, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.95263671875, 'epoch': 0.21, 'tokens/total': 147456.0, 'tokens/trainable': 106403.0}
	11%\|██████████████████▍ \| 9/84 [02:39<19:25, 15.54s/it] 12%\|████████████████████▎ \| 10/84 [02:54<18:59, 15.39s/it] {'loss': 2.7276, 'grad_norm': 1.9797707796096802, 'learning_rate': 9.893047717923764e-06, 'ppl': 15.29613, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.0211181640625, 'epoch': 0.24, 'tokens/total': 163840.0, 'tokens/trainable': 119564.0}
	12%\|████████████████████▎ \| 10/84 [02:54<18:59, 15.39s/it] 13%\|██████████████████████▍ \| 11/84 [03:09<18:36, 15.29s/it] {'loss': 2.7648, 'grad_norm': 1.7535356283187866, 'learning_rate': 9.878213859337848e-06, 'ppl': 15.87586, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.7185974121094, 'epoch': 0.26, 'tokens/total': 180224.0, 'tokens/trainable': 130051.0}
	13%\|██████████████████████▍ \| 11/84 [03:09<18:36, 15.29s/it] 14%\|████████████████████████▍ \| 12/84 [03:24<18:16, 15.23s/it] {'loss': 2.7597, 'grad_norm': 1.589630126953125, 'learning_rate': 9.863013474387117e-06, 'ppl': 15.7951, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 436.2579345703125, 'epoch': 0.29, 'tokens/total': 196608.0, 'tokens/trainable': 141402.0}
	14%\|████████████████████████▍ \| 12/84 [03:24<18:16, 15.23s/it] 15%\|██████████████████████████▍ \| 13/84 [03:39<17:56, 15.16s/it] {'loss': 2.371, 'grad_norm': 1.2782939672470093, 'learning_rate': 9.847433830145746e-06, 'ppl': 10.7081, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 380.26220703125, 'epoch': 0.31, 'tokens/total': 212992.0, 'tokens/trainable': 152557.0}
	15%\|██████████████████████████▍ \| 13/84 [03:39<17:56, 15.16s/it] 17%\|████████████████████████████▌ \| 14/84 [03:55<17:39, 15.13s/it] {'loss': 2.3513, 'grad_norm': 1.4143847227096558, 'learning_rate': 9.831460374698509e-06, 'ppl': 10.49921, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 394.7943115234375, 'epoch': 0.33, 'tokens/total': 229376.0, 'tokens/trainable': 165183.0}
	17%\|████████████████████████████▌ \| 14/84 [03:55<17:39, 15.13s/it] 18%\|██████████████████████████████▌ \| 15/84 [04:10<17:23, 15.13s/it] {'loss': 2.7007, 'grad_norm': 1.272851586341858, 'learning_rate': 9.815078556130175e-06, 'ppl': 14.89015, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.2015380859375, 'epoch': 0.36, 'tokens/total': 245760.0, 'tokens/trainable': 176902.0}
	18%\|██████████████████████████████▌ \| 15/84 [04:10<17:23, 15.13s/it] 19%\|████████████████████████████████▌ \| 16/84 [04:25<17:07, 15.11s/it] {'loss': 2.6622, 'grad_norm': 1.1848429441452026, 'learning_rate': 9.79827109404141e-06, 'ppl': 14.32778, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.4239501953125, 'epoch': 0.38, 'tokens/total': 262144.0, 'tokens/trainable': 189732.0}
	19%\|████████████████████████████████▌ \| 16/84 [04:25<17:07, 15.11s/it] 20%\|██████████████████████████████████▌ \| 17/84 [04:40<16:50, 15.08s/it] {'loss': 2.7354, 'grad_norm': 1.4691566228866577, 'learning_rate': 9.781021617527585e-06, 'ppl': 15.41591, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 288.30322265625, 'epoch': 0.4, 'tokens/total': 278528.0, 'tokens/trainable': 199657.0}
	20%\|██████████████████████████████████▌ \| 17/84 [04:40<16:50, 15.08s/it] 21%\|████████████████████████████████████▋ \| 18/84 [04:55<16:33, 15.06s/it] {'loss': 2.5652, 'grad_norm': 1.199852466583252, 'learning_rate': 9.763312846189365e-06, 'ppl': 13.00326, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 437.531982421875, 'epoch': 0.43, 'tokens/total': 294912.0, 'tokens/trainable': 211618.0}
	21%\|████████████████████████████████████▋ \| 18/84 [04:55<16:33, 15.06s/it] 23%\|██████████████████████████████████████▋ \| 19/84 [05:10<16:18, 15.05s/it] {'loss': 2.4957, 'grad_norm': 1.1276423931121826, 'learning_rate': 9.745127499627415e-06, 'ppl': 12.13022, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.6016540527344, 'epoch': 0.45, 'tokens/total': 311296.0, 'tokens/trainable': 223722.0}
	23%\|██████████████████████████████████████▋ \| 19/84 [05:10<16:18, 15.05s/it] 24%\|████████████████████████████████████████▋ \| 20/84 [05:25<16:03, 15.05s/it] {'loss': 2.5416, 'grad_norm': 1.1429506540298462, 'learning_rate': 9.726443749968894e-06, 'ppl': 12.69997, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 474.4263000488281, 'epoch': 0.48, 'tokens/total': 327680.0, 'tokens/trainable': 236639.0}
	24%\|████████████████████████████████████████▋ \| 20/84 [05:25<16:03, 15.05s/it] 25%\|██████████████████████████████████████████▊ \| 21/84 [05:40<15:48, 15.05s/it] {'loss': 2.437, 'grad_norm': 0.9757459759712219, 'learning_rate': 9.707241588330362e-06, 'ppl': 11.43867, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.1600341796875, 'epoch': 0.5, 'tokens/total': 344064.0, 'tokens/trainable': 248033.0}
	25%\|██████████████████████████████████████████▊ \| 21/84 [05:40<15:48, 15.05s/it] 26%\|████████████████████████████████████████████▊ \| 22/84 [05:55<15:34, 15.08s/it] {'loss': 2.2622, 'grad_norm': 0.9250922799110413, 'learning_rate': 9.687500096333679e-06, 'ppl': 9.6042, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 341.8017578125, 'epoch': 0.52, 'tokens/total': 360448.0, 'tokens/trainable': 259928.0}
	26%\|████████████████████████████████████████████▊ \| 22/84 [05:55<15:34, 15.08s/it] 27%\|██████████████████████████████████████████████▊ \| 23/84 [06:10<15:20, 15.10s/it] {'loss': 2.6973, 'grad_norm': 1.0023396015167236, 'learning_rate': 9.667194717621896e-06, 'ppl': 14.83961, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 456.94940185546875, 'epoch': 0.55, 'tokens/total': 376832.0, 'tokens/trainable': 273229.0}
	27%\|██████████████████████████████████████████████▊ \| 23/84 [06:10<15:20, 15.10s/it] 29%\|████████████████████████████████████████████████▊ \| 24/84 [06:25<15:05, 15.09s/it] {'loss': 2.4554, 'grad_norm': 0.8203016519546509, 'learning_rate': 9.646301805332769e-06, 'ppl': 11.65109, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 377.00543212890625, 'epoch': 0.57, 'tokens/total': 393216.0, 'tokens/trainable': 285582.0}
	29%\|████████████████████████████████████████████████▊ \| 24/84 [06:25<15:05, 15.09s/it] 30%\|██████████████████████████████████████████████████▉ \| 25/84 [06:40<14:48, 15.06s/it] {'loss': 2.4833, 'grad_norm': 0.9041063785552979, 'learning_rate': 9.624795893614646e-06, 'ppl': 11.98074, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 307.10601806640625, 'epoch': 0.6, 'tokens/total': 409600.0, 'tokens/trainable': 296666.0}
	30%\|██████████████████████████████████████████████████▉ \| 25/84 [06:40<14:48, 15.06s/it] 31%\|████████████████████████████████████████████████████▉ \| 26/84 [06:55<14:34, 15.07s/it] {'loss': 2.5628, 'grad_norm': 0.9094843864440918, 'learning_rate': 9.602648788131773e-06, 'ppl': 12.97209, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 424.59381103515625, 'epoch': 0.62, 'tokens/total': 425984.0, 'tokens/trainable': 309496.0}
	31%\|████████████████████████████████████████████████████▉ \| 26/84 [06:55<14:34, 15.07s/it] 32%\|██████████████████████████████████████████████████████▉ \| 27/84 [07:10<14:19, 15.07s/it] {'loss': 2.5322, 'grad_norm': 0.9065321683883667, 'learning_rate': 9.579831385053694e-06, 'ppl': 12.58115, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.19561767578125, 'epoch': 0.64, 'tokens/total': 442368.0, 'tokens/trainable': 323050.0}
	32%\|██████████████████████████████████████████████████████▉ \| 27/84 [07:10<14:19, 15.07s/it] 33%\|█████████████████████████████████████████████████████████ \| 28/84 [07:25<14:03, 15.06s/it] {'loss': 2.4975, 'grad_norm': 0.9096329212188721, 'learning_rate': 9.55631367105525e-06, 'ppl': 12.15208, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 276.995361328125, 'epoch': 0.67, 'tokens/total': 458752.0, 'tokens/trainable': 332229.0}
	33%\|█████████████████████████████████████████████████████████ \| 28/84 [07:25<14:03, 15.06s/it] 35%\|███████████████████████████████████████████████████████████ \| 29/84 [07:41<13:48, 15.07s/it] {'loss': 2.2896, 'grad_norm': 0.811943531036377, 'learning_rate': 9.532061994832475e-06, 'ppl': 9.87099, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.5038146972656, 'epoch': 0.69, 'tokens/total': 475136.0, 'tokens/trainable': 342910.0}
	35%\|███████████████████████████████████████████████████████████ \| 29/84 [07:41<13:48, 15.07s/it] 36%\|█████████████████████████████████████████████████████████████ \| 30/84 [07:56<13:32, 15.05s/it] {'loss': 2.1711, 'grad_norm': 0.7714135646820068, 'learning_rate': 9.507041795586701e-06, 'ppl': 8.76792, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.57464599609375, 'epoch': 0.71, 'tokens/total': 491520.0, 'tokens/trainable': 354990.0}
	36%\|█████████████████████████████████████████████████████████████ \| 30/84 [07:56<13:32, 15.05s/it] 37%\|███████████████████████████████████████████████████████████████ \| 31/84 [08:11<13:18, 15.06s/it] {'loss': 2.5228, 'grad_norm': 0.7664998769760132, 'learning_rate': 9.48121669352986e-06, 'ppl': 12.46345, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.0357666015625, 'epoch': 0.74, 'tokens/total': 507904.0, 'tokens/trainable': 368404.0}
	37%\|███████████████████████████████████████████████████████████████ \| 31/84 [08:11<13:18, 15.06s/it] 38%\|█████████████████████████████████████████████████████████████████▏ \| 32/84 [08:26<13:03, 15.06s/it] {'loss': 2.4413, 'grad_norm': 0.7703967094421387, 'learning_rate': 9.454544851905666e-06, 'ppl': 11.48797, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.70343017578125, 'epoch': 0.76, 'tokens/total': 524288.0, 'tokens/trainable': 381274.0}
	38%\|█████████████████████████████████████████████████████████████████▏ \| 32/84 [08:26<13:03, 15.06s/it] 39%\|███████████████████████████████████████████████████████████████████▏ \| 33/84 [08:41<12:47, 15.04s/it] {'loss': 2.3928, 'grad_norm': 0.8420023918151855, 'learning_rate': 9.426987162441947e-06, 'ppl': 10.94409, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 350.9320983886719, 'epoch': 0.79, 'tokens/total': 540672.0, 'tokens/trainable': 392843.0}
	39%\|███████████████████████████████████████████████████████████████████▏ \| 33/84 [08:41<12:47, 15.04s/it] 40%\|█████████████████████████████████████████████████████████████████████▏ \| 34/84 [08:56<12:33, 15.07s/it] {'loss': 2.4511, 'grad_norm': 0.7319009900093079, 'learning_rate': 9.398496331414208e-06, 'ppl': 11.6011, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 461.1392822265625, 'epoch': 0.81, 'tokens/total': 557056.0, 'tokens/trainable': 406127.0}
	40%\|█████████████████████████████████████████████████████████████████████▏ \| 34/84 [08:56<12:33, 15.07s/it] 42%\|███████████████████████████████████████████████████████████████████████▎ \| 35/84 [09:11<12:18, 15.07s/it] {'loss': 2.6036, 'grad_norm': 0.845214307308197, 'learning_rate': 9.369024155603256e-06, 'ppl': 13.51229, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.6831359863281, 'epoch': 0.83, 'tokens/total': 573440.0, 'tokens/trainable': 416879.0}
	42%\|███████████████████████████████████████████████████████████████████████▎ \| 35/84 [09:11<12:18, 15.07s/it] 43%\|█████████████████████████████████████████████████████████████████████████▎ \| 36/84 [09:26<12:02, 15.06s/it] {'loss': 2.3314, 'grad_norm': 0.7980889678001404, 'learning_rate': 9.338521522295196e-06, 'ppl': 10.29234, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 359.2618103027344, 'epoch': 0.86, 'tokens/total': 589824.0, 'tokens/trainable': 428113.0}
	43%\|█████████████████████████████████████████████████████████████████████████▎ \| 36/84 [09:26<12:02, 15.06s/it] 44%\|███████████████████████████████████████████████████████████████████████████▎ \| 37/84 [09:41<11:49, 15.09s/it] {'loss': 2.4644, 'grad_norm': 0.7808263897895813, 'learning_rate': 9.306930223829113e-06, 'ppl': 11.75643, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.8631591796875, 'epoch': 0.88, 'tokens/total': 606208.0, 'tokens/trainable': 441366.0}
	44%\|███████████████████████████████████████████████████████████████████████████▎ \| 37/84 [09:41<11:49, 15.09s/it] 45%\|█████████████████████████████████████████████████████████████████████████████▎ \| 38/84 [09:56<11:33, 15.07s/it] {'loss': 1.9823, 'grad_norm': 0.7255628705024719, 'learning_rate': 9.274192962038796e-06, 'ppl': 7.25942, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.92559814453125, 'epoch': 0.9, 'tokens/total': 622592.0, 'tokens/trainable': 453641.0}
	45%\|█████████████████████████████████████████████████████████████████████████████▎ \| 38/84 [09:56<11:33, 15.07s/it] 46%\|███████████████████████████████████████████████████████████████████████████████▍ \| 39/84 [10:11<11:17, 15.05s/it] {'loss': 2.3952, 'grad_norm': 0.8048214316368103, 'learning_rate': 9.240246072295122e-06, 'ppl': 10.97039, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 308.4759521484375, 'epoch': 0.93, 'tokens/total': 638976.0, 'tokens/trainable': 465009.0}
	46%\|███████████████████████████████████████████████████████████████████████████████▍ \| 39/84 [10:11<11:17, 15.05s/it] 48%\|█████████████████████████████████████████████████████████████████████████████████▍ \| 40/84 [10:26<11:02, 15.05s/it] {'loss': 2.119, 'grad_norm': 0.834456741809845, 'learning_rate': 9.205020433000755e-06, 'ppl': 8.32281, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 238.43943786621094, 'epoch': 0.95, 'tokens/total': 655360.0, 'tokens/trainable': 474320.0}
	48%\|█████████████████████████████████████████████████████████████████████████████████▍ \| 40/84 [10:26<11:02, 15.05s/it] 49%\|███████████████████████████████████████████████████████████████████████████████████▍ \| 41/84 [10:41<10:47, 15.05s/it] {'loss': 2.1589, 'grad_norm': 0.7231636047363281, 'learning_rate': 9.168443284579553e-06, 'ppl': 8.6616, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.91583251953125, 'epoch': 0.98, 'tokens/total': 671744.0, 'tokens/trainable': 486804.0}
	49%\|███████████████████████████████████████████████████████████████████████████████████▍ \| 41/84 [10:41<10:47, 15.05s/it] 50%\|█████████████████████████████████████████████████████████████████████████████████████▌ \| 42/84 [10:56<10:32, 15.07s/it] {'loss': 2.2662, 'grad_norm': 0.7623542547225952, 'learning_rate': 9.13043459149776e-06, 'ppl': 9.64269, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 316.7720031738281, 'epoch': 1.0, 'tokens/total': 688128.0, 'tokens/trainable': 498319.0}
	50%\|█████████████████████████████████████████████████████████████████████████████████████▌ \| 42/84 [10:56<10:32, 15.07s/it][2026-01-04 00:23:42,204] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-42
	51%\|███████████████████████████████████████████████████████████████████████████████████████▌ \| 43/84 [11:15<11:01, 16.15s/it] {'loss': 2.364, 'grad_norm': 0.820637583732605, 'learning_rate': 9.09090886125341e-06, 'ppl': 10.6334, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 388.0730285644531, 'epoch': 1.02, 'tokens/total': 704512.0, 'tokens/trainable': 509948.0}
	51%\|███████████████████████████████████████████████████████████████████████████████████████▌ \| 43/84 [11:15<11:01, 16.15s/it] 52%\|█████████████████████████████████████████████████████████████████████████████████████████▌ \| 44/84 [11:30<10:33, 15.83s/it] {'loss': 2.2541, 'grad_norm': 0.7583165168762207, 'learning_rate': 9.049773325386923e-06, 'ppl': 9.52672, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 406.7909851074219, 'epoch': 1.05, 'tokens/total': 720896.0, 'tokens/trainable': 521725.0}
	52%\|█████████████████████████████████████████████████████████████████████████████████████████▌ \| 44/84 [11:30<10:33, 15.83s/it] 54%\|███████████████████████████████████████████████████████████████████████████████████████████▌ \| 45/84 [11:45<10:07, 15.58s/it] {'loss': 2.184, 'grad_norm': 0.7426517009735107, 'learning_rate': 9.006927939481102e-06, 'ppl': 8.88176, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.7640380859375, 'epoch': 1.07, 'tokens/total': 737280.0, 'tokens/trainable': 533462.0}
	54%\|███████████████████████████████████████████████████████████████████████████████████████████▌ \| 45/84 [11:45<10:07, 15.58s/it] 55%\|█████████████████████████████████████████████████████████████████████████████████████████████▋ \| 46/84 [12:00<09:45, 15.42s/it] {'loss': 2.2603, 'grad_norm': 0.7592601776123047, 'learning_rate': 8.962263564171735e-06, 'ppl': 9.58596, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.5147399902344, 'epoch': 1.1, 'tokens/total': 753664.0, 'tokens/trainable': 544963.0}
	55%\|█████████████████████████████████████████████████████████████████████████████████████████████▋ \| 46/84 [12:00<09:45, 15.42s/it] 56%\|███████████████████████████████████████████████████████████████████████████████████████████████▋ \| 47/84 [12:15<09:27, 15.33s/it] {'loss': 2.3293, 'grad_norm': 0.712399423122406, 'learning_rate': 8.915662874642294e-06, 'ppl': 10.27075, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.820068359375, 'epoch': 1.12, 'tokens/total': 770048.0, 'tokens/trainable': 557568.0}
	56%\|███████████████████████████████████████████████████████████████████████████████████████████████▋ \| 47/84 [12:15<09:27, 15.33s/it] 57%\|█████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 48/84 [12:30<09:10, 15.28s/it] {'loss': 2.3301, 'grad_norm': 0.6970159411430359, 'learning_rate': 8.866994903655723e-06, 'ppl': 10.27897, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 479.2054443359375, 'epoch': 1.14, 'tokens/total': 786432.0, 'tokens/trainable': 571098.0}
	57%\|█████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 48/84 [12:30<09:10, 15.28s/it] 58%\|███████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 49/84 [12:46<08:53, 15.24s/it] {'loss': 2.1423, 'grad_norm': 0.8117857575416565, 'learning_rate': 8.81612049852265e-06, 'ppl': 8.51901, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 311.01019287109375, 'epoch': 1.17, 'tokens/total': 802816.0, 'tokens/trainable': 580687.0}
	58%\|███████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 49/84 [12:46<08:53, 15.24s/it] 60%\|█████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 50/84 [13:01<08:37, 15.21s/it] {'loss': 2.287, 'grad_norm': 0.7534531950950623, 'learning_rate': 8.762885954638477e-06, 'ppl': 9.84536, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 460.1733703613281, 'epoch': 1.19, 'tokens/total': 819200.0, 'tokens/trainable': 593415.0}
	60%\|█████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 50/84 [13:01<08:37, 15.21s/it] 61%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 51/84 [13:16<08:21, 15.18s/it] {'loss': 2.2903, 'grad_norm': 0.7555272579193115, 'learning_rate': 8.707123924978077e-06, 'ppl': 9.8779, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 357.5971374511719, 'epoch': 1.21, 'tokens/total': 835584.0, 'tokens/trainable': 605122.0}
	61%\|███████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 51/84 [13:16<08:21, 15.18s/it] 62%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 52/84 [13:31<08:04, 15.16s/it] {'loss': 2.0339, 'grad_norm': 0.7037354707717896, 'learning_rate': 8.648648872622289e-06, 'ppl': 7.64384, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 445.2397155761719, 'epoch': 1.24, 'tokens/total': 851968.0, 'tokens/trainable': 617663.0}
	62%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 52/84 [13:31<08:04, 15.16s/it] 63%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 53/84 [13:46<07:49, 15.14s/it] {'loss': 2.208, 'grad_norm': 0.7909743189811707, 'learning_rate': 8.587257980252616e-06, 'ppl': 9.0975, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 342.66082763671875, 'epoch': 1.26, 'tokens/total': 868352.0, 'tokens/trainable': 628294.0}
	63%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 53/84 [13:46<07:49, 15.14s/it] 64%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 54/84 [14:01<07:33, 15.12s/it] {'loss': 2.0099, 'grad_norm': 0.9540871977806091, 'learning_rate': 8.522727512172423e-06, 'ppl': 7.46257, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.34002685546875, 'epoch': 1.29, 'tokens/total': 884736.0, 'tokens/trainable': 639131.0}
	64%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 54/84 [14:01<07:33, 15.12s/it] 65%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 55/84 [14:16<07:18, 15.11s/it] {'loss': 2.1202, 'grad_norm': 0.7127824425697327, 'learning_rate': 8.454810085822828e-06, 'ppl': 8.3328, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 309.6125793457031, 'epoch': 1.31, 'tokens/total': 901120.0, 'tokens/trainable': 650860.0}
	65%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 55/84 [14:16<07:18, 15.11s/it] 67%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 56/84 [14:31<07:02, 15.08s/it] {'loss': 2.2369, 'grad_norm': 0.8307490944862366, 'learning_rate': 8.383233762288e-06, 'ppl': 9.36426, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 360.55908203125, 'epoch': 1.33, 'tokens/total': 917504.0, 'tokens/trainable': 660335.0}
	67%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 56/84 [14:31<07:02, 15.08s/it] 68%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 57/84 [14:46<06:47, 15.10s/it] {'loss': 2.4058, 'grad_norm': 0.8058661818504333, 'learning_rate': 8.307692041853443e-06, 'ppl': 11.0873, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 369.5503845214844, 'epoch': 1.36, 'tokens/total': 933888.0, 'tokens/trainable': 672664.0}
	68%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 57/84 [14:46<06:47, 15.10s/it] 69%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 58/84 [15:01<06:32, 15.10s/it] {'loss': 2.2731, 'grad_norm': 0.7699398994445801, 'learning_rate': 8.227847501984797e-06, 'ppl': 9.70945, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.2121887207031, 'epoch': 1.38, 'tokens/total': 950272.0, 'tokens/trainable': 685353.0}
	69%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 58/84 [15:01<06:32, 15.10s/it] 70%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 59/84 [15:17<06:17, 15.09s/it] {'loss': 2.2848, 'grad_norm': 0.7645865678787231, 'learning_rate': 8.143322702380829e-06, 'ppl': 9.82372, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 433.5550842285156, 'epoch': 1.4, 'tokens/total': 966656.0, 'tokens/trainable': 697377.0}
	70%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 59/84 [15:17<06:17, 15.09s/it] 71%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 60/84 [15:32<06:01, 15.08s/it] {'loss': 2.2004, 'grad_norm': 0.7868902087211609, 'learning_rate': 8.053691090026405e-06, 'ppl': 9.02862, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 322.94708251953125, 'epoch': 1.43, 'tokens/total': 983040.0, 'tokens/trainable': 708580.0}
	71%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 60/84 [15:32<06:01, 15.08s/it] 73%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 61/84 [15:47<05:47, 15.10s/it] {'loss': 2.1096, 'grad_norm': 0.8960193991661072, 'learning_rate': 7.9584779086872e-06, 'ppl': 8.24494, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.5703125, 'epoch': 1.45, 'tokens/total': 999424.0, 'tokens/trainable': 721379.0}
	73%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 61/84 [15:47<05:47, 15.10s/it] 74%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 62/84 [16:02<05:31, 15.08s/it] {'loss': 1.994, 'grad_norm': 0.7898715138435364, 'learning_rate': 7.857142918510363e-06, 'ppl': 7.34485, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 345.618408203125, 'epoch': 1.48, 'tokens/total': 1015808.0, 'tokens/trainable': 732230.0}
	74%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 62/84 [16:02<05:31, 15.08s/it] 75%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 63/84 [16:17<05:16, 15.09s/it] {'loss': 2.1251, 'grad_norm': 0.7547686100006104, 'learning_rate': 7.749077667540405e-06, 'ppl': 8.37373, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.9949951171875, 'epoch': 1.5, 'tokens/total': 1032192.0, 'tokens/trainable': 743308.0}
	75%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 63/84 [16:17<05:16, 15.09s/it] 76%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 64/84 [16:32<05:01, 15.06s/it] {'loss': 2.2899, 'grad_norm': 0.8282785415649414, 'learning_rate': 7.633587301825173e-06, 'ppl': 9.87395, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 389.5559997558594, 'epoch': 1.52, 'tokens/total': 1048576.0, 'tokens/trainable': 754578.0}
	76%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 64/84 [16:32<05:01, 15.06s/it] 77%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 65/84 [16:47<04:46, 15.07s/it] {'loss': 1.982, 'grad_norm': 0.7019599080085754, 'learning_rate': 7.509881015721476e-06, 'ppl': 7.25724, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 467.1014709472656, 'epoch': 1.55, 'tokens/total': 1064960.0, 'tokens/trainable': 767778.0}
	77%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 65/84 [16:47<04:46, 15.07s/it] 79%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 66/84 [17:02<04:31, 15.06s/it] {'loss': 2.177, 'grad_norm': 0.8358809351921082, 'learning_rate': 7.3770493145275395e-06, 'ppl': 8.81981, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 329.0794982910156, 'epoch': 1.57, 'tokens/total': 1081344.0, 'tokens/trainable': 778410.0}
	79%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 66/84 [17:02<04:31, 15.06s/it] 80%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 67/84 [17:17<04:16, 15.07s/it] {'loss': 1.9627, 'grad_norm': 0.7629905343055725, 'learning_rate': 7.234042186610168e-06, 'ppl': 7.11852, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 318.5070495605469, 'epoch': 1.6, 'tokens/total': 1097728.0, 'tokens/trainable': 790046.0}
	80%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 67/84 [17:17<04:16, 15.07s/it] 81%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 68/84 [17:32<04:01, 15.09s/it] {'loss': 2.2829, 'grad_norm': 0.7605993151664734, 'learning_rate': 7.079645911289845e-06, 'ppl': 9.80507, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 432.19000244140625, 'epoch': 1.62, 'tokens/total': 1114112.0, 'tokens/trainable': 803491.0}
	81%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 68/84 [17:32<04:01, 15.09s/it] 82%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 69/84 [17:47<03:46, 15.11s/it] {'loss': 2.1591, 'grad_norm': 0.7556662559509277, 'learning_rate': 6.912442131579155e-06, 'ppl': 8.66334, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 449.40435791015625, 'epoch': 1.64, 'tokens/total': 1130496.0, 'tokens/trainable': 816919.0}
	82%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 69/84 [17:47<03:46, 15.11s/it] 83%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 70/84 [18:02<03:31, 15.12s/it] {'loss': 2.0868, 'grad_norm': 0.7916563749313354, 'learning_rate': 6.73076920065796e-06, 'ppl': 8.05908, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.95281982421875, 'epoch': 1.67, 'tokens/total': 1146880.0, 'tokens/trainable': 827854.0}
	83%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 70/84 [18:02<03:31, 15.12s/it] 85%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 71/84 [18:18<03:16, 15.14s/it] {'loss': 2.046, 'grad_norm': 0.7467530965805054, 'learning_rate': 6.532663064717781e-06, 'ppl': 7.73689, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.4479675292969, 'epoch': 1.69, 'tokens/total': 1163264.0, 'tokens/trainable': 841071.0}
	85%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 71/84 [18:18<03:16, 15.14s/it] 86%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 72/84 [18:33<03:01, 15.15s/it] {'loss': 2.2584, 'grad_norm': 0.769206166267395, 'learning_rate': 6.315789050859166e-06, 'ppl': 9.56777, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.147216796875, 'epoch': 1.71, 'tokens/total': 1179648.0, 'tokens/trainable': 853505.0}
	86%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 72/84 [18:33<03:01, 15.15s/it] 87%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 73/84 [18:48<02:46, 15.13s/it] {'loss': 2.2368, 'grad_norm': 0.8090022206306458, 'learning_rate': 6.0773477343900595e-06, 'ppl': 9.36332, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 312.2845153808594, 'epoch': 1.74, 'tokens/total': 1196032.0, 'tokens/trainable': 864232.0}
	87%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 73/84 [18:48<02:46, 15.13s/it] 88%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 74/84 [19:03<02:31, 15.13s/it] {'loss': 2.2596, 'grad_norm': 0.7845657467842102, 'learning_rate': 5.81395306653576e-06, 'ppl': 9.57926, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.0311279296875, 'epoch': 1.76, 'tokens/total': 1212416.0, 'tokens/trainable': 876412.0}
	88%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 74/84 [19:03<02:31, 15.13s/it] 89%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 75/84 [19:18<02:15, 15.10s/it] {'loss': 2.0239, 'grad_norm': 0.8282763361930847, 'learning_rate': 5.52147184862406e-06, 'ppl': 7.56778, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.2695617675781, 'epoch': 1.79, 'tokens/total': 1228800.0, 'tokens/trainable': 887204.0}
	89%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 75/84 [19:18<02:15, 15.10s/it] 90%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 76/84 [19:33<02:00, 15.11s/it] {'loss': 2.0459, 'grad_norm': 0.8150551319122314, 'learning_rate': 5.19480499860947e-06, 'ppl': 7.73612, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.22265625, 'epoch': 1.81, 'tokens/total': 1245184.0, 'tokens/trainable': 898698.0}
	90%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 76/84 [19:33<02:00, 15.11s/it] 92%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 77/84 [19:48<01:46, 15.16s/it] {'loss': 2.3338, 'grad_norm': 0.8316982388496399, 'learning_rate': 4.82758605357958e-06, 'ppl': 10.31707, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 442.561279296875, 'epoch': 1.83, 'tokens/total': 1261568.0, 'tokens/trainable': 910126.0}
	92%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 77/84 [19:48<01:46, 15.16s/it] 93%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 78/84 [20:04<01:30, 15.16s/it] {'loss': 2.3689, 'grad_norm': 0.8292647004127502, 'learning_rate': 4.411764621181646e-06, 'ppl': 10.68563, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.56951904296875, 'epoch': 1.86, 'tokens/total': 1277952.0, 'tokens/trainable': 922871.0}
	93%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 78/84 [20:04<01:30, 15.16s/it] 94%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 79/84 [20:19<01:15, 15.17s/it] {'loss': 2.2947, 'grad_norm': 0.7834141850471497, 'learning_rate': 3.937007477361476e-06, 'ppl': 9.92146, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 476.15325927734375, 'epoch': 1.88, 'tokens/total': 1294336.0, 'tokens/trainable': 935425.0}
	94%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 79/84 [20:19<01:15, 15.17s/it] 95%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 80/84 [20:34<01:00, 15.18s/it] {'loss': 2.0461, 'grad_norm': 0.7962015271186829, 'learning_rate': 3.3898304536705837e-06, 'ppl': 7.73767, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.98101806640625, 'epoch': 1.9, 'tokens/total': 1310720.0, 'tokens/trainable': 947062.0}
	95%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 80/84 [20:34<01:00, 15.18s/it] 96%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 81/84 [20:49<00:45, 15.18s/it] {'loss': 2.2245, 'grad_norm': 0.7918537259101868, 'learning_rate': 2.7522935397428228e-06, 'ppl': 9.24886, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 347.5594787597656, 'epoch': 1.93, 'tokens/total': 1327104.0, 'tokens/trainable': 959572.0}
	96%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 81/84 [20:49<00:45, 15.18s/it] 98%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 82/84 [21:04<00:30, 15.19s/it] {'loss': 2.4159, 'grad_norm': 0.7803449630737305, 'learning_rate': 1.9999999949504854e-06, 'ppl': 11.19985, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 441.7668762207031, 'epoch': 1.95, 'tokens/total': 1343488.0, 'tokens/trainable': 973127.0}
	98%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 82/84 [21:04<00:30, 15.19s/it] 99%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 83/84 [21:20<00:15, 15.19s/it] {'loss': 2.13, 'grad_norm': 0.7520664930343628, 'learning_rate': 1.0989010661432985e-06, 'ppl': 8.41487, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 448.2036437988281, 'epoch': 1.98, 'tokens/total': 1359872.0, 'tokens/trainable': 985757.0}
	99%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 83/84 [21:20<00:15, 15.19s/it] 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 84/84 [21:35<00:00, 15.20s/it] {'loss': 2.2931, 'grad_norm': 0.9194205403327942, 'learning_rate': 0.0, 'ppl': 9.9056, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 277.64056396484375, 'epoch': 2.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0}
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 84/84 [21:35<00:00, 15.20s/it][2026-01-04 00:34:20,737] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-84
	{'train_runtime': 1438.4013, 'train_samples_per_second': 0.234, 'train_steps_per_second': 0.058, 'train_loss': 2.380285389366604, 'memory/max_active (GiB)': 15.96, 'memory/max_allocated (GiB)': 15.96, 'memory/device_reserved (GiB)': 20.93, 'epoch': 2.0, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0}
	100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 84/84 [21:36<00:00, 15.20s/it] 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 84/84 [21:36<00:00, 15.44s/it]
	[2026-01-04 00:34:22,468] [INFO] [axolotl.train.save_trained_model:233] [PID:15692] Training completed! Saving trained model to ./output.
	[2026-01-04 00:34:23,290] [INFO] [axolotl.train.save_trained_model:351] [PID:15692] Model successfully saved to ./output
	[0m