Upload folder using huggingface_hub

726ef83 verified 29 days ago

132 kB

	[2026-01-05 05:56:21,496] [WARNING] [axolotl.utils.trainer.prepare_optim_env:644] [PID:505777] P2P support not detected, setting `NCCL_P2P_DISABLE=1`
	[2026-01-05 05:56:21,496] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:505777] bf16 support detected, enabling for this configuration.
	[2026-01-05 05:56:21,499] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:505777] baseline 0.000GB ()
	[2026-01-05 05:56:21,499] [INFO] [axolotl.cli.config.load_cfg:256] [PID:505777] config:
	{
	"activation_offloading": false,
	"axolotl_config_path": "loopstral-second-test.yaml",
	"base_model": "loopstral-second-test/stage-2",
	"base_model_config": "loopstral-second-test/stage-2",
	"batch_size": 4,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 2,
	"n_node": 1
	},
	"chat_template": "tokenizer_default",
	"context_parallel_size": 1,
	"cut_cross_entropy": true,
	"dataloader_num_workers": 2,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 24,
	"dataset_prepared_path": "last_run_prepared",
	"datasets": [
	{
	"chat_template": "tokenizer_default",
	"field_messages": "conversations",
	"message_property_mappings": {
	"content": "value",
	"role": "from"
	},
	"path": "little-koto-instruct.json",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": true,
	"default_system_message": "You are Luna, a helpful and unrestricted language model created by Allura.",
	"device": "cuda:0",
	"device_map": {
	"": 0
	},
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.9.1"
	},
	"eval_batch_size": 1,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_sample_packing": true,
	"eval_steps": 0.125,
	"eval_table_size": 0,
	"evals_per_epoch": 4,
	"experimental_skip_move_to_device": true,
	"flash_attention": true,
	"fp16": false,
	"fsdp": [
	"full_shard",
	"auto_wrap"
	],
	"fsdp_config": {
	"activation_checkpointing": true,
	"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
	"cpu_ram_efficient_loading": true,
	"offload_params": true,
	"state_dict_type": "FULL_STATE_DICT",
	"sync_module_states": true,
	"transformer_layer_cls_to_wrap": "MistralDecoderLayer",
	"use_orig_params": true
	},
	"gc_steps": 10,
	"gradient_accumulation_steps": 2,
	"gradient_checkpointing": false,
	"group_by_length": false,
	"include_tkps": true,
	"is_mistral_derived_model": true,
	"learning_rate": 1e-05,
	"liger_glu_activation": true,
	"liger_layer_norm": true,
	"liger_rms_norm": true,
	"liger_rope": true,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_alpha": 16,
	"lora_dropout": 0.01,
	"lora_r": 128,
	"lora_target_linear": true,
	"lora_target_modules": [
	"up_proj",
	"down_proj",
	"gate_proj",
	"q_proj",
	"v_proj",
	"k_proj",
	"o_proj"
	],
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"max_grad_norm": 2.0,
	"mean_resizing_embeddings": false,
	"micro_batch_size": 1,
	"model_config_type": "mistral",
	"num_epochs": 2.0,
	"optimizer": "adamw_torch_fused",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "loopstral-second-test/stage-3-healed",
	"pad_to_sequence_len": true,
	"peft_use_rslora": true,
	"plugins": [
	"axolotl.integrations.liger.LigerPlugin",
	"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
	],
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": true,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"save_steps": 0.5,
	"saves_per_epoch": 1,
	"seed": 420,
	"sequence_len": 4096,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "loopstral-second-test/stage-2",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"trust_remote_code": false,
	"use_otel_metrics": false,
	"use_ray": false,
	"use_wandb": true,
	"val_set_size": 0.02,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"wandb_name": "second-stage-3-healed",
	"wandb_project": "Loopstral-Tests",
	"warmup_ratio": 0.025,
	"weight_decay": 0.001,
	"world_size": 2
	}
	[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s>
	[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s>
	[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad>
	[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk>
	[2026-01-05 05:56:27,343] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:505777] Loading prepared dataset from disk at last_run_prepared/7bb3932098dd42f3b946c9e64ba32239...
	[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 18_837
	[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 13_323`
	[2026-01-05 05:56:27,355] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:27,916] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:28,173] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.257068395614624
	[2026-01-05 05:56:28,174] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25144505500793457
	[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25107741355895996
	[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:28,927] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25074076652526855
	[2026-01-05 05:56:29,428] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]
	[2026-01-05 05:56:29,485] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 1
	[2026-01-05 05:56:29,499] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.7664794921875, 0.9197753667831421]
	[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: None
	[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 2
	[2026-01-05 05:56:29,505] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 922_178
	[2026-01-05 05:56:29,514] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 746_491`
	[2026-01-05 05:56:29,525] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:29,778] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252016544342041
	[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:30,282] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252286434173584
	[2026-01-05 05:56:30,283] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:30,534] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25146055221557617
	[2026-01-05 05:56:30,535] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially.
	[2026-01-05 05:56:30,786] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25194621086120605
	[2026-01-05 05:56:30,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228]
	[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 57
	[2026-01-05 05:56:30,803] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.9874610304832458, 0.9874610304832458]
	[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: 0.99
	[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 114
	[2026-01-05 05:56:30,804] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:505777] Maximum number of steps set at 114
	[2026-01-05 05:56:30,828] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:505777] loading tokenizer... loopstral-second-test/stage-2
	[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s>
	[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s>
	[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad>
	[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk>
	[2026-01-05 05:56:30,988] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:505777] Loading model
	[2026-01-05 05:56:30,994] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:505777] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2026-01-05 05:56:30,995] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:505777] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2026-01-05 05:56:30,995] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:505777] Applying multipack dataloader patch for sample packing...
	[2026-01-05 05:56:31,073] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:505777] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
	[2026-01-05 05:56:31,145] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:505777] Applying Cut Cross Entropy to model type: mistral
	Loading checkpoint shards: 0%\| \| 0/3 [00:00<?, ?it/s] Loading checkpoint shards: 33%\|██████████████████▋ \| 1/3 [00:01<00:03, 1.52s/it] Loading checkpoint shards: 67%\|█████████████████████████████████████▎ \| 2/3 [00:03<00:01, 1.78s/it] Loading checkpoint shards: 100%\|████████████████████████████████████████████████████████\| 3/3 [00:04<00:00, 1.58s/it] Loading checkpoint shards: 100%\|████████████████████████████████████████████████████████\| 3/3 [00:04<00:00, 1.60s/it]
	[2026-01-05 05:56:36,035] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:505777] Converting modules to torch.bfloat16
	[2026-01-05 05:56:36,037] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:505777] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
	[2026-01-05 05:56:39,795] [INFO] [axolotl.train.save_initial_configs:417] [PID:505777] Pre-saving tokenizer to loopstral-second-test/stage-3-healed...
	[2026-01-05 05:56:39,829] [INFO] [axolotl.train.save_initial_configs:422] [PID:505777] Pre-saving model config to loopstral-second-test/stage-3-healed...
	[2026-01-05 05:56:39,831] [INFO] [axolotl.train.execute_training:212] [PID:505777] Starting trainer...
	[2026-01-05 05:56:42,363] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0816106796264648
	[2026-01-05 05:56:43,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0880126953125
	[2026-01-05 05:56:44,524] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0731561183929443
	[2026-01-05 05:56:45,606] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0811669826507568
	[2026-01-05 05:56:45,607] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228]
	[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralForCausalLM because mixed precision turned on in FSDP. Affects: model.embed_tokens.weight, model.norm.weight, lm_head.weight.
	warnings.warn(

	[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralDecoderLayer because mixed precision turned on in FSDP. Affects: self_attn.q_proj.weight, self_attn.k_proj.weight, self_attn.v_proj.weight, self_attn.o_proj.weight, mlp.gate_proj.weight, mlp.up_proj.weight, mlp.down_proj.weight, input_layernorm.weight, post_attention_layernorm.weight.
	warnings.warn(

	[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1974: UserWarning: FSDP upcast of low precision parameters may affect the precision of model checkpoints.
	warnings.warn(

	[34m[1mwandb[0m: Currently logged in as: [33mcooawoo[0m ([33mcooawoo-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
	[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
	[Am[2K [34m[1mwandb[0m: [38;5;178m⣻[0m setting up run 90pp12rs (0.2s)
	[Am[2K [34m[1mwandb[0m: [38;5;178m⣽[0m setting up run 90pp12rs (0.2s)
	[Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.23.1
	[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/aibox/training/wandb/run-20260105_055655-90pp12rs[0m
	[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
	[34m[1mwandb[0m: Syncing run [33msecond-stage-3-healed[0m
	[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests[0m
	[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests/runs/90pp12rs[0m
	[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
	[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
	[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
	[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
	[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
	[2026-01-05 05:56:58,375] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:505777] The Axolotl config has been saved to the WandB run under files.
	0%\| \| 0/114 [00:00<?, ?it/s][2026-01-05 05:56:58,382] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 05:57:01,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.6251494884490967
	[2026-01-05 05:57:03,393] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.645111322402954
	[2026-01-05 05:57:04,955] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5617477893829346
	[2026-01-05 05:57:06,522] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5672385692596436
	[2026-01-05 05:57:06,523] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:07<00:00, 3.66s/it][A
	[A{'eval_loss': 1.1707839965820312, 'eval_runtime': 35.4697, 'eval_samples_per_second': 0.536, 'eval_steps_per_second': 0.282, 'eval_ppl': 3.2245, 'memory/max_active (GiB)': 3.76, 'memory/max_allocated (GiB)': 3.76, 'memory/device_reserved (GiB)': 9.22, 'epoch': 0}
	0%\| \| 0/114 [00:43<?, ?it/s]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:07<00:00, 3.66s/it][A
	[A 1%\|▋ \| 1/114 [01:12<2:16:26, 72.45s/it] {'loss': 1.2216, 'grad_norm': 19.611356735229492, 'learning_rate': 0.0, 'ppl': 3.3926, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 420.22, 'total_tokens': 23701, 'epoch': 0.02}
	1%\|▋ \| 1/114 [01:12<2:16:26, 72.45s/it] 2%\|█▍ \| 2/114 [01:29<1:14:56, 40.15s/it] {'loss': 1.3721, 'grad_norm': 22.5228214263916, 'learning_rate': 5e-06, 'ppl': 3.9436, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 438.27, 'total_tokens': 36593, 'epoch': 0.04}
	2%\|█▍ \| 2/114 [01:29<1:14:56, 40.15s/it] 3%\|██▏ \| 3/114 [01:47<55:10, 29.82s/it] {'loss': 1.2242, 'grad_norm': 23.2207088470459, 'learning_rate': 1e-05, 'ppl': 3.4014, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 235.72, 'total_tokens': 47910, 'epoch': 0.05}
	3%\|██▏ \| 3/114 [01:47<55:10, 29.82s/it] 4%\|██▊ \| 4/114 [02:04<45:44, 24.95s/it] {'loss': 1.2708, 'grad_norm': 9.459047317504883, 'learning_rate': 9.998033131915266e-06, 'ppl': 3.5637, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.71, 'total_tokens': 62377, 'epoch': 0.07}
	4%\|██▊ \| 4/114 [02:04<45:44, 24.95s/it] 4%\|███▌ \| 5/114 [02:22<40:39, 22.38s/it] {'loss': 1.0452, 'grad_norm': 8.951719284057617, 'learning_rate': 9.992134075089085e-06, 'ppl': 2.844, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 371.1, 'total_tokens': 74989, 'epoch': 0.09}
	4%\|███▌ \| 5/114 [02:22<40:39, 22.38s/it] 5%\|████▎ \| 6/114 [02:40<37:09, 20.64s/it] {'loss': 1.037, 'grad_norm': 5.003825664520264, 'learning_rate': 9.982307470588097e-06, 'ppl': 2.8207, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.42, 'total_tokens': 88184, 'epoch': 0.11}
	5%\|████▎ \| 6/114 [02:40<37:09, 20.64s/it] 6%\|████▉ \| 7/114 [02:57<35:04, 19.67s/it] {'loss': 1.1807, 'grad_norm': 6.691038131713867, 'learning_rate': 9.968561049466214e-06, 'ppl': 3.2567, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 291.57, 'total_tokens': 100065, 'epoch': 0.12}
	6%\|████▉ \| 7/114 [02:57<35:04, 19.67s/it] 7%\|█████▋ \| 8/114 [03:15<33:30, 18.96s/it] {'loss': 1.1553, 'grad_norm': 4.891448974609375, 'learning_rate': 9.950905626682229e-06, 'ppl': 3.175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 305.04, 'total_tokens': 111981, 'epoch': 0.14}
	7%\|█████▋ \| 8/114 [03:15<33:30, 18.96s/it] 8%\|██████▍ \| 9/114 [03:32<32:31, 18.59s/it] {'loss': 0.9665, 'grad_norm': 4.162895202636719, 'learning_rate': 9.92935509259118e-06, 'ppl': 2.6287, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.6, 'total_tokens': 126309, 'epoch': 0.16}
	8%\|██████▍ \| 9/114 [03:32<32:31, 18.59s/it] 9%\|███████ \| 10/114 [03:50<31:48, 18.35s/it] {'loss': 1.1024, 'grad_norm': 4.0764946937561035, 'learning_rate': 9.903926402016153e-06, 'ppl': 3.0114, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 315.69, 'total_tokens': 138777, 'epoch': 0.18}
	9%\|███████ \| 10/114 [03:50<31:48, 18.35s/it] 10%\|███████▋ \| 11/114 [04:08<31:01, 18.08s/it] {'loss': 0.9937, 'grad_norm': 4.487460613250732, 'learning_rate': 9.874639560909118e-06, 'ppl': 2.7012, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 381.39, 'total_tokens': 152449, 'epoch': 0.19}
	10%\|███████▋ \| 11/114 [04:08<31:01, 18.08s/it] 11%\|████████▍ \| 12/114 [04:25<30:28, 17.93s/it] {'loss': 1.0321, 'grad_norm': 4.153564453125, 'learning_rate': 9.841517610611309e-06, 'ppl': 2.807, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.38, 'total_tokens': 165644, 'epoch': 0.21}
	11%\|████████▍ \| 12/114 [04:25<30:28, 17.93s/it] 11%\|█████████ \| 13/114 [04:43<30:05, 17.88s/it] {'loss': 0.8913, 'grad_norm': 5.131930828094482, 'learning_rate': 9.804586609725499e-06, 'ppl': 2.4383, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.4, 'total_tokens': 177744, 'epoch': 0.23}
	11%\|█████████ \| 13/114 [04:43<30:05, 17.88s/it] 12%\|█████████▊ \| 14/114 [05:01<29:39, 17.80s/it] {'loss': 0.862, 'grad_norm': 4.371148109436035, 'learning_rate': 9.763875613614482e-06, 'ppl': 2.3679, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 316.1, 'total_tokens': 190738, 'epoch': 0.25}
	12%\|█████████▊ \| 14/114 [05:01<29:39, 17.80s/it] 13%\|██████████▌ \| 15/114 [05:19<29:34, 17.92s/it] {'loss': 1.1077, 'grad_norm': 4.898997783660889, 'learning_rate': 9.719416651541839e-06, 'ppl': 3.0274, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.3, 'total_tokens': 203458, 'epoch': 0.26}
	13%\|██████████▌ \| 15/114 [05:19<29:34, 17.92s/it][2026-01-05 06:02:17,805] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:02:21,514] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.7565350532531738
	[2026-01-05 06:02:23,355] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8411920070648193
	[2026-01-05 06:02:25,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8490314483642578
	[2026-01-05 06:02:27,076] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8708629608154297
	[2026-01-05 06:02:27,078] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:07<00:00, 3.77s/it][A
	[A{'eval_loss': 0.8464773297309875, 'eval_runtime': 10.0269, 'eval_samples_per_second': 1.895, 'eval_steps_per_second': 0.997, 'eval_ppl': 2.3314, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.26}
	13%\|██████████▌ \| 15/114 [05:38<29:34, 17.92s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:07<00:00, 3.77s/it][A
	[A 14%\|███████████▏ \| 16/114 [05:57<39:07, 23.96s/it] {'loss': 0.9622, 'grad_norm': 3.9995832443237305, 'learning_rate': 9.671244701472999e-06, 'ppl': 2.6174, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 315.28, 'total_tokens': 227749, 'epoch': 0.28}
	14%\|███████████▏ \| 16/114 [05:57<39:07, 23.96s/it] 15%\|███████████▉ \| 17/114 [06:16<36:20, 22.47s/it] {'loss': 1.1259, 'grad_norm': 4.939190864562988, 'learning_rate': 9.619397662556434e-06, 'ppl': 3.083, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 279.61, 'total_tokens': 238481, 'epoch': 0.3}
	15%\|███████████▉ \| 17/114 [06:16<36:20, 22.47s/it] 16%\|████████████▋ \| 18/114 [06:34<33:40, 21.05s/it] {'loss': 1.038, 'grad_norm': 4.293745040893555, 'learning_rate': 9.563916325306595e-06, 'ppl': 2.8236, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.17, 'total_tokens': 251682, 'epoch': 0.32}
	16%\|████████████▋ \| 18/114 [06:34<33:40, 21.05s/it] 17%\|█████████████▎ \| 19/114 [06:51<31:46, 20.07s/it] {'loss': 0.9465, 'grad_norm': 4.432325839996338, 'learning_rate': 9.504844339512096e-06, 'ppl': 2.5767, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 338.71, 'total_tokens': 263764, 'epoch': 0.33}
	17%\|█████████████▎ \| 19/114 [06:51<31:46, 20.07s/it] 18%\|██████████████ \| 20/114 [07:09<30:29, 19.46s/it] {'loss': 1.0822, 'grad_norm': 4.204977512359619, 'learning_rate': 9.442228179894362e-06, 'ppl': 2.9512, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.66, 'total_tokens': 276958, 'epoch': 0.35}
	18%\|██████████████ \| 20/114 [07:09<30:29, 19.46s/it] 18%\|██████████████▋ \| 21/114 [07:27<29:28, 19.01s/it] {'loss': 0.9275, 'grad_norm': 4.079134464263916, 'learning_rate': 9.376117109543769e-06, 'ppl': 2.5282, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 361.9, 'total_tokens': 289687, 'epoch': 0.37}
	18%\|██████████████▋ \| 21/114 [07:27<29:28, 19.01s/it] 19%\|███████████████▍ \| 22/114 [07:45<28:29, 18.58s/it] {'loss': 1.0468, 'grad_norm': 19.045801162719727, 'learning_rate': 9.306563141162046e-06, 'ppl': 2.8485, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 404.94, 'total_tokens': 303849, 'epoch': 0.39}
	19%\|███████████████▍ \| 22/114 [07:45<28:29, 18.58s/it] 20%\|████████████████▏ \| 23/114 [08:03<27:58, 18.44s/it] {'loss': 0.9479, 'grad_norm': 4.227816581726074, 'learning_rate': 9.233620996141421e-06, 'ppl': 2.5803, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 399.93, 'total_tokens': 318222, 'epoch': 0.4}
	20%\|████████████████▏ \| 23/114 [08:03<27:58, 18.44s/it] 21%\|████████████████▊ \| 24/114 [08:21<27:18, 18.21s/it] {'loss': 0.8656, 'grad_norm': 4.401808261871338, 'learning_rate': 9.157348061512728e-06, 'ppl': 2.3764, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 424.57, 'total_tokens': 331525, 'epoch': 0.42}
	21%\|████████████████▊ \| 24/114 [08:21<27:18, 18.21s/it] 22%\|█████████████████▌ \| 25/114 [08:39<26:47, 18.06s/it] {'loss': 0.9866, 'grad_norm': 4.104758262634277, 'learning_rate': 9.077804344796302e-06, 'ppl': 2.6821, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 342.8, 'total_tokens': 344242, 'epoch': 0.44}
	22%\|█████████████████▌ \| 25/114 [08:39<26:47, 18.06s/it] 23%\|██████████████████▏ \| 26/114 [08:56<26:17, 17.92s/it] {'loss': 0.8213, 'grad_norm': 3.642549514770508, 'learning_rate': 8.995052426791247e-06, 'ppl': 2.2735, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 425.35, 'total_tokens': 358434, 'epoch': 0.46}
	23%\|██████████████████▏ \| 26/114 [08:56<26:17, 17.92s/it] 24%\|██████████████████▉ \| 27/114 [09:14<25:57, 17.91s/it] {'loss': 0.7607, 'grad_norm': 3.46269154548645, 'learning_rate': 8.90915741234015e-06, 'ppl': 2.1398, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.27, 'total_tokens': 373583, 'epoch': 0.47}
	24%\|██████████████████▉ \| 27/114 [09:14<25:57, 17.91s/it] 25%\|███████████████████▋ \| 28/114 [09:32<25:31, 17.80s/it] {'loss': 0.8874, 'grad_norm': 4.47441291809082, 'learning_rate': 8.820186879108038e-06, 'ppl': 2.4288, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.56, 'total_tokens': 388098, 'epoch': 0.49}
	25%\|███████████████████▋ \| 28/114 [09:32<25:31, 17.80s/it] 25%\|████████████████████▎ \| 29/114 [09:49<25:11, 17.79s/it] {'loss': 0.9208, 'grad_norm': 7.110525131225586, 'learning_rate': 8.728210824415829e-06, 'ppl': 2.5113, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 378.74, 'total_tokens': 401056, 'epoch': 0.51}
	25%\|████████████████████▎ \| 29/114 [09:49<25:11, 17.79s/it] 26%\|█████████████████████ \| 30/114 [10:07<24:54, 17.80s/it] {'loss': 0.9311, 'grad_norm': 4.498164176940918, 'learning_rate': 8.633301610170136e-06, 'ppl': 2.5373, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 382.0, 'total_tokens': 414830, 'epoch': 0.53}
	26%\|█████████████████████ \| 30/114 [10:07<24:54, 17.80s/it][2026-01-05 06:07:06,008] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:07:09,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8823564052581787
	[2026-01-05 06:07:11,720] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.9062902927398682
	[2026-01-05 06:07:13,619] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8988819122314453
	[2026-01-05 06:07:15,483] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.863325834274292
	[2026-01-05 06:07:15,484] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.28s/it][A
	[A{'eval_loss': 0.8130025267601013, 'eval_runtime': 9.0489, 'eval_samples_per_second': 2.1, 'eval_steps_per_second': 1.105, 'eval_ppl': 2.2547, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.53}
	26%\|█████████████████████ \| 30/114 [10:26<24:54, 17.80s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.28s/it][A
	[A 27%\|█████████████████████▊ \| 31/114 [10:43<32:09, 23.25s/it] {'loss': 0.9568, 'grad_norm': 4.3019700050354, 'learning_rate': 8.535533905932739e-06, 'ppl': 2.6034, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.97, 'total_tokens': 438063, 'epoch': 0.54}
	27%\|█████████████████████▊ \| 31/114 [10:43<32:09, 23.25s/it] 28%\|██████████████████████▍ \| 32/114 [11:01<29:36, 21.66s/it] {'loss': 1.4274, 'grad_norm': 13.445784568786621, 'learning_rate': 8.43498463017451e-06, 'ppl': 4.1678, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 411.95, 'total_tokens': 449755, 'epoch': 0.56}
	28%\|██████████████████████▍ \| 32/114 [11:01<29:36, 21.66s/it] 29%\|███████████████████████▏ \| 33/114 [11:19<27:33, 20.41s/it] {'loss': 0.9124, 'grad_norm': 4.726632595062256, 'learning_rate': 8.331732889760021e-06, 'ppl': 2.4903, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 430.35, 'total_tokens': 464296, 'epoch': 0.58}
	29%\|███████████████████████▏ \| 33/114 [11:19<27:33, 20.41s/it] 30%\|███████████████████████▊ \| 34/114 [11:36<26:05, 19.57s/it] {'loss': 0.9753, 'grad_norm': 4.2374067306518555, 'learning_rate': 8.22585991771044e-06, 'ppl': 2.652, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 386.8, 'total_tokens': 476505, 'epoch': 0.6}
	30%\|███████████████████████▊ \| 34/114 [11:36<26:05, 19.57s/it] 31%\|████████████████████████▌ \| 35/114 [11:54<25:01, 19.00s/it] {'loss': 0.9245, 'grad_norm': 3.9087975025177, 'learning_rate': 8.117449009293668e-06, 'ppl': 2.5206, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 364.2, 'total_tokens': 490425, 'epoch': 0.61}
	31%\|████████████████████████▌ \| 35/114 [11:54<25:01, 19.00s/it] 32%\|█████████████████████████▎ \| 36/114 [12:12<24:21, 18.73s/it] {'loss': 0.889, 'grad_norm': 4.065995216369629, 'learning_rate': 8.00658545649203e-06, 'ppl': 2.4327, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.93, 'total_tokens': 504789, 'epoch': 0.63}
	32%\|█████████████████████████▎ \| 36/114 [12:12<24:21, 18.73s/it] 32%\|█████████████████████████▉ \| 37/114 [12:30<23:38, 18.42s/it] {'loss': 0.8679, 'grad_norm': 4.762394428253174, 'learning_rate': 7.89335648089903e-06, 'ppl': 2.3819, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 403.62, 'total_tokens': 518553, 'epoch': 0.65}
	32%\|█████████████████████████▉ \| 37/114 [12:30<23:38, 18.42s/it] 33%\|██████████████████████████▋ \| 38/114 [12:48<23:08, 18.27s/it] {'loss': 1.0176, 'grad_norm': 15.393413543701172, 'learning_rate': 7.777851165098012e-06, 'ppl': 2.7665, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 294.78, 'total_tokens': 531189, 'epoch': 0.67}
	33%\|██████████████████████████▋ \| 38/114 [12:48<23:08, 18.27s/it] 34%\|███████████████████████████▎ \| 39/114 [13:05<22:33, 18.04s/it] {'loss': 0.9678, 'grad_norm': 3.8640310764312744, 'learning_rate': 7.660160382576683e-06, 'ppl': 2.6321, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.2, 'total_tokens': 544812, 'epoch': 0.68}
	34%\|███████████████████████████▎ \| 39/114 [13:05<22:33, 18.04s/it] 35%\|████████████████████████████ \| 40/114 [13:23<22:18, 18.09s/it] {'loss': 1.0927, 'grad_norm': 4.1391167640686035, 'learning_rate': 7.540376726232648e-06, 'ppl': 2.9823, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 353.51, 'total_tokens': 558027, 'epoch': 0.7}
	35%\|████████████████████████████ \| 40/114 [13:23<22:18, 18.09s/it] 36%\|████████████████████████████▊ \| 41/114 [13:41<21:45, 17.88s/it] {'loss': 0.8597, 'grad_norm': 3.4433345794677734, 'learning_rate': 7.4185944355261996e-06, 'ppl': 2.3625, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 410.17, 'total_tokens': 572138, 'epoch': 0.72}
	36%\|████████████████████████████▊ \| 41/114 [13:41<21:45, 17.88s/it] 37%\|█████████████████████████████▍ \| 42/114 [13:58<21:19, 17.77s/it] {'loss': 0.8931, 'grad_norm': 3.9547741413116455, 'learning_rate': 7.294909322337689e-06, 'ppl': 2.4427, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 374.06, 'total_tokens': 585392, 'epoch': 0.74}
	37%\|█████████████████████████████▍ \| 42/114 [13:58<21:19, 17.77s/it] 38%\|██████████████████████████████▏ \| 43/114 [14:16<21:04, 17.81s/it] {'loss': 0.9103, 'grad_norm': 7.8218255043029785, 'learning_rate': 7.169418695587791e-06, 'ppl': 2.4851, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.66, 'total_tokens': 599321, 'epoch': 0.75}
	38%\|██████████████████████████████▏ \| 43/114 [14:16<21:04, 17.81s/it] 39%\|██████████████████████████████▉ \| 44/114 [14:34<20:41, 17.73s/it] {'loss': 0.8742, 'grad_norm': 4.100659370422363, 'learning_rate': 7.042221284679982e-06, 'ppl': 2.397, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 277.15, 'total_tokens': 611687, 'epoch': 0.77}
	39%\|██████████████████████████████▉ \| 44/114 [14:34<20:41, 17.73s/it] 39%\|███████████████████████████████▌ \| 45/114 [14:52<20:32, 17.87s/it] {'loss': 1.0304, 'grad_norm': 4.246405124664307, 'learning_rate': 6.913417161825449e-06, 'ppl': 2.8022, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.16, 'total_tokens': 624725, 'epoch': 0.79}
	39%\|███████████████████████████████▌ \| 45/114 [14:52<20:32, 17.87s/it][2026-01-05 06:11:50,684] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:11:54,496] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8704819679260254
	[2026-01-05 06:11:56,390] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.893763780593872
	[2026-01-05 06:11:58,227] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.836214542388916
	[2026-01-05 06:12:00,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8623669147491455
	[2026-01-05 06:12:00,091] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.30s/it][A
	[A{'eval_loss': 0.8040304780006409, 'eval_runtime': 9.0899, 'eval_samples_per_second': 2.09, 'eval_steps_per_second': 1.1, 'eval_ppl': 2.2345, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.79}
	39%\|███████████████████████████████▌ \| 45/114 [15:10<20:32, 17.87s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.30s/it][A
	[A 40%\|████████████████████████████████▎ \| 46/114 [15:28<26:29, 23.37s/it] {'loss': 0.936, 'grad_norm': 4.026333332061768, 'learning_rate': 6.783107663311566e-06, 'ppl': 2.5498, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 425.2, 'total_tokens': 648077, 'epoch': 0.81}
	40%\|████████████████████████████████▎ \| 46/114 [15:28<26:29, 23.37s/it] 41%\|████████████████████████████████▉ \| 47/114 [15:46<24:11, 21.67s/it] {'loss': 0.893, 'grad_norm': 3.673527717590332, 'learning_rate': 6.651395309775837e-06, 'ppl': 2.4424, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 394.49, 'total_tokens': 662323, 'epoch': 0.82}
	41%\|████████████████████████████████▉ \| 47/114 [15:46<24:11, 21.67s/it] 42%\|█████████████████████████████████▋ \| 48/114 [16:03<22:26, 20.40s/it] {'loss': 1.0356, 'grad_norm': 6.087688446044922, 'learning_rate': 6.518383725548074e-06, 'ppl': 2.8168, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.91, 'total_tokens': 674394, 'epoch': 0.84}
	42%\|█████████████████████████████████▋ \| 48/114 [16:03<22:26, 20.40s/it] 43%\|██████████████████████████████████▍ \| 49/114 [16:21<21:16, 19.64s/it] {'loss': 0.8771, 'grad_norm': 3.8041579723358154, 'learning_rate': 6.384177557124247e-06, 'ppl': 2.4039, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 366.56, 'total_tokens': 687865, 'epoch': 0.86}
	43%\|██████████████████████████████████▍ \| 49/114 [16:21<21:16, 19.64s/it] 44%\|███████████████████████████████████ \| 50/114 [16:39<20:26, 19.16s/it] {'loss': 0.991, 'grad_norm': 3.9628713130950928, 'learning_rate': 6.248882390836135e-06, 'ppl': 2.6939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 383.67, 'total_tokens': 700828, 'epoch': 0.88}
	44%\|███████████████████████████████████ \| 50/114 [16:39<20:26, 19.16s/it] 45%\|███████████████████████████████████▊ \| 51/114 [16:57<19:36, 18.67s/it] {'loss': 0.8278, 'grad_norm': 3.998246669769287, 'learning_rate': 6.112604669781572e-06, 'ppl': 2.2883, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 396.65, 'total_tokens': 713732, 'epoch': 0.89}
	45%\|███████████████████████████████████▊ \| 51/114 [16:57<19:36, 18.67s/it] 46%\|████████████████████████████████████▍ \| 52/114 [17:14<18:54, 18.30s/it] {'loss': 0.8638, 'grad_norm': 4.033308506011963, 'learning_rate': 5.975451610080643e-06, 'ppl': 2.3722, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 336.52, 'total_tokens': 726692, 'epoch': 0.91}
	46%\|████████████████████████████████████▍ \| 52/114 [17:14<18:54, 18.30s/it] 46%\|█████████████████████████████████████▏ \| 53/114 [17:32<18:25, 18.13s/it] {'loss': 1.3544, 'grad_norm': 17.30064582824707, 'learning_rate': 5.837531116523683e-06, 'ppl': 3.8744, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 403.9, 'total_tokens': 739602, 'epoch': 0.93}
	46%\|█████████████████████████████████████▏ \| 53/114 [17:32<18:25, 18.13s/it] 47%\|█████████████████████████████████████▉ \| 54/114 [17:49<17:54, 17.92s/it] {'loss': 0.9146, 'grad_norm': 3.651843309402466, 'learning_rate': 5.698951697677498e-06, 'ppl': 2.4958, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 410.43, 'total_tokens': 753378, 'epoch': 0.95}
	47%\|█████████████████████████████████████▉ \| 54/114 [17:49<17:54, 17.92s/it] 48%\|██████████████████████████████████████▌ \| 55/114 [18:07<17:40, 17.97s/it] {'loss': 1.0104, 'grad_norm': 4.368696212768555, 'learning_rate': 5.559822380516539e-06, 'ppl': 2.7467, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 382.87, 'total_tokens': 767289, 'epoch': 0.96}
	48%\|██████████████████████████████████████▌ \| 55/114 [18:07<17:40, 17.97s/it] 49%\|███████████████████████████████████████▎ \| 56/114 [18:25<17:13, 17.81s/it] {'loss': 1.0087, 'grad_norm': 5.310736656188965, 'learning_rate': 5.420252624646238e-06, 'ppl': 2.742, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 311.44, 'total_tokens': 777836, 'epoch': 0.98}
	49%\|███████████████████████████████████████▎ \| 56/114 [18:25<17:13, 17.81s/it] 50%\|████████████████████████████████████████ \| 57/114 [18:44<17:18, 18.22s/it] {'loss': 0.8086, 'grad_norm': 4.207986831665039, 'learning_rate': 5.2803522361859596e-06, 'ppl': 2.2448, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 289.66, 'total_tokens': 788519, 'epoch': 1.0}
	50%\|████████████████████████████████████████ \| 57/114 [18:44<17:18, 18.22s/it][2026-01-05 06:15:42,787] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
	warnings.warn(

	[2026-01-05 06:16:05,005] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-57
	51%\|███████████████████████████████████████▋ \| 58/114 [21:43<1:02:04, 66.51s/it] {'loss': 0.6146, 'grad_norm': 3.4813573360443115, 'learning_rate': 5.140231281379345e-06, 'ppl': 1.8489, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 261.61, 'total_tokens': 801358, 'epoch': 1.02}
	51%\|███████████████████████████████████████▋ \| 58/114 [21:44<1:02:04, 66.51s/it] 52%\|█████████████████████████████████████████▍ \| 59/114 [22:03<48:05, 52.46s/it] {'loss': 0.6649, 'grad_norm': 3.9285855293273926, 'learning_rate': 5e-06, 'ppl': 1.9443, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 407.5, 'total_tokens': 814250, 'epoch': 1.04}
	52%\|█████████████████████████████████████████▍ \| 59/114 [22:03<48:05, 52.46s/it] 53%\|██████████████████████████████████████████ \| 60/114 [22:21<37:57, 42.17s/it] {'loss': 0.6454, 'grad_norm': 3.718013286590576, 'learning_rate': 4.859768718620656e-06, 'ppl': 1.9067, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 227.37, 'total_tokens': 825567, 'epoch': 1.05}
	53%\|██████████████████████████████████████████ \| 60/114 [22:21<37:57, 42.17s/it][2026-01-05 06:19:19,814] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:19:24,401] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.28102445602417
	[2026-01-05 06:19:26,656] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2544238567352295
	[2026-01-05 06:19:28,921] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2653965950012207
	[2026-01-05 06:19:31,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2563204765319824
	[2026-01-05 06:19:31,179] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.48s/it][A
	[A{'eval_loss': 0.7972212433815002, 'eval_runtime': 9.5865, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 1.043, 'eval_ppl': 2.2194, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.05}
	53%\|██████████████████████████████████████████ \| 60/114 [22:42<37:57, 42.17s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:07<00:00, 3.48s/it][A
	[A 54%\|██████████████████████████████████████████▊ \| 61/114 [22:59<36:17, 41.08s/it] {'loss': 0.6167, 'grad_norm': 5.092803001403809, 'learning_rate': 4.719647763814041e-06, 'ppl': 1.8528, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.69, 'total_tokens': 850896, 'epoch': 1.07}
	54%\|██████████████████████████████████████████▊ \| 61/114 [22:59<36:17, 41.08s/it] 54%\|███████████████████████████████████████████▌ \| 62/114 [23:17<29:34, 34.13s/it] {'loss': 0.4792, 'grad_norm': 5.132570743560791, 'learning_rate': 4.579747375353763e-06, 'ppl': 1.6148, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 378.34, 'total_tokens': 863508, 'epoch': 1.09}
	54%\|███████████████████████████████████████████▌ \| 62/114 [23:17<29:34, 34.13s/it] 55%\|████████████████████████████████████████████▏ \| 63/114 [23:35<24:42, 29.08s/it] {'loss': 0.4102, 'grad_norm': 4.274756908416748, 'learning_rate': 4.4401776194834615e-06, 'ppl': 1.5071, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.24, 'total_tokens': 876703, 'epoch': 1.11}
	55%\|████████████████████████████████████████████▏ \| 63/114 [23:35<24:42, 29.08s/it] 56%\|████████████████████████████████████████████▉ \| 64/114 [23:52<21:22, 25.64s/it] {'loss': 0.6118, 'grad_norm': 5.282613754272461, 'learning_rate': 4.3010483023225045e-06, 'ppl': 1.8437, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 293.01, 'total_tokens': 888584, 'epoch': 1.12}
	56%\|████████████████████████████████████████████▉ \| 64/114 [23:52<21:22, 25.64s/it] 57%\|█████████████████████████████████████████████▌ \| 65/114 [24:10<18:58, 23.23s/it] {'loss': 0.5366, 'grad_norm': 4.78963565826416, 'learning_rate': 4.162468883476319e-06, 'ppl': 1.7102, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 302.8, 'total_tokens': 900500, 'epoch': 1.14}
	57%\|█████████████████████████████████████████████▌ \| 65/114 [24:10<18:58, 23.23s/it] 58%\|██████████████████████████████████████████████▎ \| 66/114 [24:28<17:16, 21.59s/it] {'loss': 0.4277, 'grad_norm': 4.404544353485107, 'learning_rate': 4.02454838991936e-06, 'ppl': 1.5337, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.06, 'total_tokens': 914828, 'epoch': 1.16}
	58%\|██████████████████████████████████████████████▎ \| 66/114 [24:28<17:16, 21.59s/it] 59%\|███████████████████████████████████████████████ \| 67/114 [24:45<15:57, 20.37s/it] {'loss': 0.4992, 'grad_norm': 4.581356525421143, 'learning_rate': 3.887395330218429e-06, 'ppl': 1.6474, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 321.0, 'total_tokens': 927296, 'epoch': 1.18}
	59%\|███████████████████████████████████████████████ \| 67/114 [24:45<15:57, 20.37s/it] 60%\|███████████████████████████████████████████████▋ \| 68/114 [25:03<14:59, 19.56s/it] {'loss': 0.4236, 'grad_norm': 3.748277425765991, 'learning_rate': 3.751117609163865e-06, 'ppl': 1.5275, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.86, 'total_tokens': 940968, 'epoch': 1.19}
	60%\|███████████████████████████████████████████████▋ \| 68/114 [25:03<14:59, 19.56s/it] 61%\|████████████████████████████████████████████████▍ \| 69/114 [25:20<14:13, 18.98s/it] {'loss': 0.4085, 'grad_norm': 3.996558427810669, 'learning_rate': 3.6158224428757538e-06, 'ppl': 1.5046, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.06, 'total_tokens': 954163, 'epoch': 1.21}
	61%\|████████████████████████████████████████████████▍ \| 69/114 [25:20<14:13, 18.98s/it] 61%\|█████████████████████████████████████████████████ \| 70/114 [25:38<13:42, 18.69s/it] {'loss': 0.367, 'grad_norm': 3.7183310985565186, 'learning_rate': 3.4816162744519266e-06, 'ppl': 1.4434, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 392.87, 'total_tokens': 966263, 'epoch': 1.23}
	61%\|█████████████████████████████████████████████████ \| 70/114 [25:38<13:42, 18.69s/it] 62%\|█████████████████████████████████████████████████▊ \| 71/114 [25:56<13:06, 18.29s/it] {'loss': 0.3049, 'grad_norm': 3.2405660152435303, 'learning_rate': 3.3486046902241663e-06, 'ppl': 1.3565, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 320.24, 'total_tokens': 979257, 'epoch': 1.25}
	62%\|█████████████████████████████████████████████████▊ \| 71/114 [25:56<13:06, 18.29s/it] 63%\|██████████████████████████████████████████████████▌ \| 72/114 [26:13<12:34, 17.97s/it] {'loss': 0.4782, 'grad_norm': 3.7915146350860596, 'learning_rate': 3.216892336688435e-06, 'ppl': 1.6132, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 352.7, 'total_tokens': 991977, 'epoch': 1.26}
	63%\|██████████████████████████████████████████████████▌ \| 72/114 [26:13<12:34, 17.97s/it] 64%\|███████████████████████████████████████████████████▏ \| 73/114 [26:31<12:12, 17.88s/it] {'loss': 0.4014, 'grad_norm': 3.5778920650482178, 'learning_rate': 3.0865828381745515e-06, 'ppl': 1.4939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.21, 'total_tokens': 1005406, 'epoch': 1.28}
	64%\|███████████████████████████████████████████████████▏ \| 73/114 [26:31<12:12, 17.88s/it] 65%\|███████████████████████████████████████████████████▉ \| 74/114 [26:48<11:50, 17.77s/it] {'loss': 0.5709, 'grad_norm': 5.925182819366455, 'learning_rate': 2.95777871532002e-06, 'ppl': 1.7699, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 291.94, 'total_tokens': 1016138, 'epoch': 1.3}
	65%\|███████████████████████████████████████████████████▉ \| 74/114 [26:48<11:50, 17.77s/it] 66%\|████████████████████████████████████████████████████▋ \| 75/114 [27:06<11:34, 17.81s/it] {'loss': 0.4398, 'grad_norm': 4.066057205200195, 'learning_rate': 2.83058130441221e-06, 'ppl': 1.5524, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 397.09, 'total_tokens': 1029339, 'epoch': 1.32}
	66%\|████████████████████████████████████████████████████▋ \| 75/114 [27:06<11:34, 17.81s/it][2026-01-05 06:24:05,049] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:24:09,576] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2422144412994385
	[2026-01-05 06:24:11,841] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2649495601654053
	[2026-01-05 06:24:14,135] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2932779788970947
	[2026-01-05 06:24:16,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.286076068878174
	[2026-01-05 06:24:16,423] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.31s/it][A
	[A{'eval_loss': 0.8332963585853577, 'eval_runtime': 9.262, 'eval_samples_per_second': 2.051, 'eval_steps_per_second': 1.08, 'eval_ppl': 2.3009, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.32}
	66%\|████████████████████████████████████████████████████▋ \| 75/114 [27:27<11:34, 17.81s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.31s/it][A
	[A 67%\|█████████████████████████████████████████████████████▎ \| 76/114 [27:44<15:10, 23.95s/it] {'loss': 0.4122, 'grad_norm': 3.938491106033325, 'learning_rate': 2.705090677662311e-06, 'ppl': 1.5101, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 341.7, 'total_tokens': 1052283, 'epoch': 1.33}
	67%\|█████████████████████████████████████████████████████▎ \| 76/114 [27:44<15:10, 23.95s/it] 68%\|██████████████████████████████████████████████████████ \| 77/114 [28:02<13:38, 22.11s/it] {'loss': 0.4916, 'grad_norm': 4.054359436035156, 'learning_rate': 2.5814055644738013e-06, 'ppl': 1.6349, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 417.0, 'total_tokens': 1065477, 'epoch': 1.35}
	68%\|██████████████████████████████████████████████████████ \| 77/114 [28:02<13:38, 22.11s/it] 68%\|██████████████████████████████████████████████████████▋ \| 78/114 [28:20<12:24, 20.69s/it] {'loss': 0.3917, 'grad_norm': 4.133195877075195, 'learning_rate': 2.4596232737673544e-06, 'ppl': 1.4795, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 373.89, 'total_tokens': 1078206, 'epoch': 1.37}
	68%\|██████████████████████████████████████████████████████▋ \| 78/114 [28:20<12:24, 20.69s/it] 69%\|███████████████████████████████████████████████████████▍ \| 79/114 [28:38<11:36, 19.90s/it] {'loss': 0.8396, 'grad_norm': 6.9484052658081055, 'learning_rate': 2.339839617423318e-06, 'ppl': 2.3154, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.68, 'total_tokens': 1092368, 'epoch': 1.39}
	69%\|███████████████████████████████████████████████████████▍ \| 79/114 [28:38<11:36, 19.90s/it] 70%\|████████████████████████████████████████████████████████▏ \| 80/114 [28:55<10:54, 19.24s/it] {'loss': 0.5273, 'grad_norm': 5.065030574798584, 'learning_rate': 2.2221488349019903e-06, 'ppl': 1.6944, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 409.1, 'total_tokens': 1106741, 'epoch': 1.4}
	70%\|████████████████████████████████████████████████████████▏ \| 80/114 [28:55<10:54, 19.24s/it] 71%\|████████████████████████████████████████████████████████▊ \| 81/114 [29:13<10:16, 18.70s/it] {'loss': 0.3963, 'grad_norm': 4.900491714477539, 'learning_rate': 2.1066435191009717e-06, 'ppl': 1.4863, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 430.18, 'total_tokens': 1120044, 'epoch': 1.42}
	71%\|████████████████████████████████████████████████████████▊ \| 81/114 [29:13<10:16, 18.70s/it] 72%\|█████████████████████████████████████████████████████████▌ \| 82/114 [29:30<09:44, 18.27s/it] {'loss': 0.4278, 'grad_norm': 3.917367458343506, 'learning_rate': 1.9934145435079705e-06, 'ppl': 1.5339, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.67, 'total_tokens': 1132761, 'epoch': 1.44}
	72%\|█████████████████████████████████████████████████████████▌ \| 82/114 [29:30<09:44, 18.27s/it] 73%\|██████████████████████████████████████████████████████████▏ \| 83/114 [29:48<09:20, 18.08s/it] {'loss': 0.3575, 'grad_norm': 3.7160727977752686, 'learning_rate': 1.8825509907063328e-06, 'ppl': 1.4298, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.13, 'total_tokens': 1146953, 'epoch': 1.46}
	73%\|██████████████████████████████████████████████████████████▏ \| 83/114 [29:48<09:20, 18.08s/it] 74%\|██████████████████████████████████████████████████████████▉ \| 84/114 [30:05<08:56, 17.89s/it] {'loss': 0.2989, 'grad_norm': 3.235100507736206, 'learning_rate': 1.7741400822895633e-06, 'ppl': 1.3484, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 435.06, 'total_tokens': 1162102, 'epoch': 1.47}
	74%\|██████████████████████████████████████████████████████████▉ \| 84/114 [30:05<08:56, 17.89s/it] 75%\|███████████████████████████████████████████████████████████▋ \| 85/114 [30:23<08:37, 17.83s/it] {'loss': 0.4173, 'grad_norm': 3.7896361351013184, 'learning_rate': 1.6682671102399806e-06, 'ppl': 1.5179, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 414.25, 'total_tokens': 1176617, 'epoch': 1.49}
	75%\|███████████████████████████████████████████████████████████▋ \| 85/114 [30:23<08:37, 17.83s/it] 75%\|████████████████████████████████████████████████████████████▎ \| 86/114 [30:40<08:15, 17.70s/it] {'loss': 0.5225, 'grad_norm': 3.7193076610565186, 'learning_rate': 1.5650153698254916e-06, 'ppl': 1.6862, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 386.25, 'total_tokens': 1189575, 'epoch': 1.51}
	75%\|████████████████████████████████████████████████████████████▎ \| 86/114 [30:40<08:15, 17.70s/it] 76%\|█████████████████████████████████████████████████████████████ \| 87/114 [30:58<08:01, 17.82s/it] {'loss': 0.4523, 'grad_norm': 3.9064459800720215, 'learning_rate': 1.4644660940672628e-06, 'ppl': 1.5719, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.18, 'total_tokens': 1203349, 'epoch': 1.53}
	76%\|█████████████████████████████████████████████████████████████ \| 87/114 [30:58<08:01, 17.82s/it] 77%\|█████████████████████████████████████████████████████████████▊ \| 88/114 [31:16<07:39, 17.68s/it] {'loss': 0.4023, 'grad_norm': 3.633103847503662, 'learning_rate': 1.3666983898298659e-06, 'ppl': 1.4953, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.67, 'total_tokens': 1215720, 'epoch': 1.54}
	77%\|█████████████████████████████████████████████████████████████▊ \| 88/114 [31:16<07:39, 17.68s/it] 78%\|██████████████████████████████████████████████████████████████▍ \| 89/114 [31:33<07:21, 17.67s/it] {'loss': 1.3284, 'grad_norm': 9.749296188354492, 'learning_rate': 1.2717891755841722e-06, 'ppl': 3.775, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 409.51, 'total_tokens': 1227412, 'epoch': 1.56}
	78%\|██████████████████████████████████████████████████████████████▍ \| 89/114 [31:33<07:21, 17.67s/it] 79%\|███████████████████████████████████████████████████████████████▏ \| 90/114 [31:51<07:06, 17.78s/it] {'loss': 0.4467, 'grad_norm': 3.298785448074341, 'learning_rate': 1.1798131208919628e-06, 'ppl': 1.5631, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.1, 'total_tokens': 1241953, 'epoch': 1.58}
	79%\|███████████████████████████████████████████████████████████████▏ \| 90/114 [31:51<07:06, 17.78s/it][2026-01-05 06:28:50,293] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:28:54,407] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0476205348968506
	[2026-01-05 06:28:56,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0721523761749268
	[2026-01-05 06:28:58,500] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.020061492919922
	[2026-01-05 06:29:00,540] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0398201942443848
	[2026-01-05 06:29:00,623] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.30s/it][A
	[A{'eval_loss': 0.813373863697052, 'eval_runtime': 9.1778, 'eval_samples_per_second': 2.07, 'eval_steps_per_second': 1.09, 'eval_ppl': 2.2555, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.58}
	79%\|███████████████████████████████████████████████████████████████▏ \| 90/114 [32:11<07:06, 17.78s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.30s/it][A
	[A 80%\|███████████████████████████████████████████████████████████████▊ \| 91/114 [32:28<09:02, 23.57s/it] {'loss': 0.4378, 'grad_norm': 3.9499611854553223, 'learning_rate': 1.0908425876598512e-06, 'ppl': 1.5493, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 388.15, 'total_tokens': 1265024, 'epoch': 1.6}
	80%\|███████████████████████████████████████████████████████████████▊ \| 91/114 [32:28<09:02, 23.57s/it] 81%\|████████████████████████████████████████████████████████████████▌ \| 92/114 [32:46<08:01, 21.87s/it] {'loss': 0.4448, 'grad_norm': 4.025573253631592, 'learning_rate': 1.004947573208756e-06, 'ppl': 1.5602, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.45, 'total_tokens': 1278944, 'epoch': 1.61}
	81%\|████████████████████████████████████████████████████████████████▌ \| 92/114 [32:46<08:01, 21.87s/it] 82%\|█████████████████████████████████████████████████████████████████▎ \| 93/114 [33:04<07:11, 20.52s/it] {'loss': 0.4514, 'grad_norm': 4.208862781524658, 'learning_rate': 9.221956552036992e-07, 'ppl': 1.5705, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 415.53, 'total_tokens': 1293308, 'epoch': 1.63}
	82%\|█████████████████████████████████████████████████████████████████▎ \| 93/114 [33:04<07:11, 20.52s/it] 82%\|█████████████████████████████████████████████████████████████████▉ \| 94/114 [33:21<06:33, 19.69s/it] {'loss': 0.4805, 'grad_norm': 5.097556114196777, 'learning_rate': 8.426519384872733e-07, 'ppl': 1.6169, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 402.99, 'total_tokens': 1307072, 'epoch': 1.65}
	82%\|█████████████████████████████████████████████████████████████████▉ \| 94/114 [33:22<06:33, 19.69s/it] 83%\|██████████████████████████████████████████████████████████████████▋ \| 95/114 [33:39<06:01, 19.05s/it] {'loss': 0.7964, 'grad_norm': 5.547924995422363, 'learning_rate': 7.663790038585794e-07, 'ppl': 2.2175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 300.56, 'total_tokens': 1319708, 'epoch': 1.67}
	83%\|██████████████████████████████████████████████████████████████████▋ \| 95/114 [33:39<06:01, 19.05s/it] 84%\|███████████████████████████████████████████████████████████████████▎ \| 96/114 [33:57<05:36, 18.69s/it] {'loss': 0.4699, 'grad_norm': 3.6507017612457275, 'learning_rate': 6.934368588379553e-07, 'ppl': 1.5998, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 388.97, 'total_tokens': 1333331, 'epoch': 1.68}
	84%\|███████████████████████████████████████████████████████████████████▎ \| 96/114 [33:57<05:36, 18.69s/it] 85%\|████████████████████████████████████████████████████████████████████ \| 97/114 [34:14<05:10, 18.29s/it] {'loss': 0.6074, 'grad_norm': 4.161734104156494, 'learning_rate': 6.238828904562316e-07, 'ppl': 1.8357, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 370.47, 'total_tokens': 1346546, 'epoch': 1.7}
	85%\|████████████████████████████████████████████████████████████████████ \| 97/114 [34:14<05:10, 18.29s/it] 86%\|████████████████████████████████████████████████████████████████████▊ \| 98/114 [34:32<04:50, 18.14s/it] {'loss': 0.4091, 'grad_norm': 3.1512742042541504, 'learning_rate': 5.577718201056392e-07, 'ppl': 1.5055, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 400.58, 'total_tokens': 1360657, 'epoch': 1.72}
	86%\|████████████████████████████████████████████████████████████████████▊ \| 98/114 [34:32<04:50, 18.14s/it] 87%\|█████████████████████████████████████████████████████████████████████▍ \| 99/114 [34:49<04:28, 17.90s/it] {'loss': 0.4531, 'grad_norm': 3.652284860610962, 'learning_rate': 4.951556604879049e-07, 'ppl': 1.5732, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 378.33, 'total_tokens': 1373911, 'epoch': 1.74}
	87%\|█████████████████████████████████████████████████████████████████████▍ \| 99/114 [34:49<04:28, 17.90s/it] 88%\|█████████████████████████████████████████████████████████████████████▎ \| 100/114 [35:08<04:13, 18.13s/it] {'loss': 0.6774, 'grad_norm': 13.29174518585205, 'learning_rate': 4.3608367469340553e-07, 'ppl': 1.9688, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 368.21, 'total_tokens': 1387840, 'epoch': 1.75}
	88%\|█████████████████████████████████████████████████████████████████████▎ \| 100/114 [35:08<04:13, 18.13s/it] 89%\|█████████████████████████████████████████████████████████████████████▉ \| 101/114 [35:26<03:55, 18.12s/it] {'loss': 0.4309, 'grad_norm': 3.5832619667053223, 'learning_rate': 3.8060233744356634e-07, 'ppl': 1.5386, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 268.73, 'total_tokens': 1400206, 'epoch': 1.77}
	89%\|█████████████████████████████████████████████████████████████████████▉ \| 101/114 [35:26<03:55, 18.12s/it] 89%\|██████████████████████████████████████████████████████████████████████▋ \| 102/114 [35:44<03:36, 18.05s/it] {'loss': 0.5536, 'grad_norm': 3.793043375015259, 'learning_rate': 3.287552985270015e-07, 'ppl': 1.7395, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.95, 'total_tokens': 1413244, 'epoch': 1.79}
	89%\|██████████████████████████████████████████████████████████████████████▋ \| 102/114 [35:44<03:36, 18.05s/it] 90%\|███████████████████████████████████████████████████████████████████████▍ \| 103/114 [36:02<03:17, 17.95s/it] {'loss': 0.5032, 'grad_norm': 3.7566542625427246, 'learning_rate': 2.8058334845816214e-07, 'ppl': 1.654, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 425.22, 'total_tokens': 1425734, 'epoch': 1.81}
	90%\|███████████████████████████████████████████████████████████████████████▍ \| 103/114 [36:02<03:17, 17.95s/it] 91%\|████████████████████████████████████████████████████████████████████████ \| 104/114 [36:19<02:58, 17.83s/it] {'loss': 0.4549, 'grad_norm': 3.206256866455078, 'learning_rate': 2.3612438638551837e-07, 'ppl': 1.576, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.45, 'total_tokens': 1439980, 'epoch': 1.82}
	91%\|████████████████████████████████████████████████████████████████████████ \| 104/114 [36:19<02:58, 17.83s/it] 92%\|████████████████████████████████████████████████████████████████████████▊ \| 105/114 [36:37<02:41, 17.89s/it] {'loss': 0.6494, 'grad_norm': 4.32829475402832, 'learning_rate': 1.9541339027450256e-07, 'ppl': 1.9144, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 340.64, 'total_tokens': 1452051, 'epoch': 1.84}
	92%\|████████████████████████████████████████████████████████████████████████▊ \| 105/114 [36:37<02:41, 17.89s/it][2026-01-05 06:33:36,260] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:505777] Running evaluation step...
	[2026-01-05 06:33:40,708] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1876299381256104
	[2026-01-05 06:33:42,931] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2227494716644287
	[2026-01-05 06:33:45,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2258124351501465
	[2026-01-05 06:33:47,352] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1935596466064453
	[2026-01-05 06:33:47,353] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5]

	0%\| \| 0/2 [00:00<?, ?it/s][A
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.28s/it][A
	[A{'eval_loss': 0.8144508004188538, 'eval_runtime': 9.188, 'eval_samples_per_second': 2.068, 'eval_steps_per_second': 1.088, 'eval_ppl': 2.2579, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.84}
	92%\|████████████████████████████████████████████████████████████████████████▊ \| 105/114 [36:58<02:41, 17.89s/it]
	100%\|███████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:06<00:00, 3.28s/it][A
	[A 93%\|█████████████████████████████████████████████████████████████████████████▍ \| 106/114 [37:15<03:11, 23.93s/it] {'loss': 0.4638, 'grad_norm': 3.6060004234313965, 'learning_rate': 1.5848238938869332e-07, 'ppl': 1.5901, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.31, 'total_tokens': 1476384, 'epoch': 1.86}
	93%\|█████████████████████████████████████████████████████████████████████████▍ \| 106/114 [37:15<03:11, 23.93s/it] 94%\|██████████████████████████████████████████████████████████████████████████▏ \| 107/114 [37:33<02:35, 22.17s/it] {'loss': 0.5655, 'grad_norm': 3.643385648727417, 'learning_rate': 1.253604390908819e-07, 'ppl': 1.7603, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 392.59, 'total_tokens': 1489347, 'epoch': 1.88}
	94%\|██████████████████████████████████████████████████████████████████████████▏ \| 107/114 [37:33<02:35, 22.17s/it] 95%\|██████████████████████████████████████████████████████████████████████████▊ \| 108/114 [37:51<02:04, 20.80s/it] {'loss': 0.4432, 'grad_norm': 3.812006711959839, 'learning_rate': 9.607359798384785e-08, 'ppl': 1.5577, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 395.61, 'total_tokens': 1502251, 'epoch': 1.89}
	95%\|██████████████████████████████████████████████████████████████████████████▊ \| 108/114 [37:51<02:04, 20.80s/it] 96%\|███████████████████████████████████████████████████████████████████████████▌ \| 109/114 [38:09<01:39, 19.86s/it] {'loss': 0.4601, 'grad_norm': 3.268007516860962, 'learning_rate': 7.064490740882057e-08, 'ppl': 1.5842, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 331.97, 'total_tokens': 1515211, 'epoch': 1.91}
	96%\|███████████████████████████████████████████████████████████████████████████▌ \| 109/114 [38:09<01:39, 19.86s/it] 96%\|████████████████████████████████████████████████████████████████████████████▏ \| 110/114 [38:27<01:17, 19.27s/it] {'loss': 1.1937, 'grad_norm': 17.401044845581055, 'learning_rate': 4.909437331777178e-08, 'ppl': 3.2993, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.17, 'total_tokens': 1528121, 'epoch': 1.93}
	96%\|████████████████████████████████████████████████████████████████████████████▏ \| 110/114 [38:27<01:17, 19.27s/it] 97%\|████████████████████████████████████████████████████████████████████████████▉ \| 111/114 [38:44<00:56, 18.68s/it] {'loss': 0.5327, 'grad_norm': 3.476621627807617, 'learning_rate': 3.143895053378698e-08, 'ppl': 1.7035, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 412.86, 'total_tokens': 1541897, 'epoch': 1.95}
	97%\|████████████████████████████████████████████████████████████████████████████▉ \| 111/114 [38:44<00:56, 18.68s/it] 98%\|█████████████████████████████████████████████████████████████████████████████▌ \| 112/114 [39:01<00:36, 18.32s/it] {'loss': 0.6603, 'grad_norm': 5.652373313903809, 'learning_rate': 1.769252941190458e-08, 'ppl': 1.9354, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.43, 'total_tokens': 1555808, 'epoch': 1.96}
	98%\|█████████████████████████████████████████████████████████████████████████████▌ \| 112/114 [39:01<00:36, 18.32s/it] 99%\|██████████████████████████████████████████████████████████████████████████████▎\| 113/114 [39:19<00:18, 18.10s/it] {'loss': 0.6319, 'grad_norm': 4.798173427581787, 'learning_rate': 7.865924910916977e-09, 'ppl': 1.8812, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 309.08, 'total_tokens': 1566355, 'epoch': 1.98}
	99%\|██████████████████████████████████████████████████████████████████████████████▎\| 113/114 [39:19<00:18, 18.10s/it] 100%\|███████████████████████████████████████████████████████████████████████████████\| 114/114 [39:39<00:00, 18.62s/it] {'loss': 0.4319, 'grad_norm': 3.836909055709839, 'learning_rate': 1.9668680847356735e-09, 'ppl': 1.5402, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 292.8, 'total_tokens': 1577038, 'epoch': 2.0}
	100%\|███████████████████████████████████████████████████████████████████████████████\| 114/114 [39:39<00:00, 18.62s/it][2026-01-05 06:36:37,660] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
	warnings.warn(

	[2026-01-05 06:36:57,988] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-114
	{'train_runtime': 2526.3299, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.045, 'train_loss': 0.76008216357022, 'memory/max_active (GiB)': 9.02, 'memory/max_allocated (GiB)': 9.02, 'memory/device_reserved (GiB)': 9.82, 'epoch': 2.0}
	100%\|███████████████████████████████████████████████████████████████████████████████\| 114/114 [42:03<00:00, 18.62s/it] 100%\|███████████████████████████████████████████████████████████████████████████████\| 114/114 [42:03<00:00, 22.13s/it]
	[2026-01-05 06:39:01,711] [INFO] [axolotl.train.save_trained_model:233] [PID:505777] Training completed! Saving trained model to loopstral-second-test/stage-3-healed.
	[2026-01-05 06:39:01,713] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
	warnings.warn(

	[2026-01-05 06:39:23,082] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed
	[2026-01-05 06:39:47,279] [INFO] [axolotl.train.save_trained_model:351] [PID:505777] Model successfully saved to loopstral-second-test/stage-3-healed
	[0m