azeddinShr's picture
Complete Spark-TTS with Arabic fine-tuned LLM
03401b7 verified
[2025-12-16 09:55:50,079] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25550] baseline 0.000GB ()
[2025-12-16 09:55:50,081] [INFO] [axolotl.cli.config.load_cfg:256] [PID:25550] config:
{
"activation_offloading": false,
"axolotl_config_path": "config_axolotl/full_finetune.yml",
"base_model": "/content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM",
"base_model_config": "/content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM",
"batch_size": 8,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1
},
"context_parallel_size": 1,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 12,
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "/content/processed_output/clartts_data.jsonl",
"trust_remote_code": false,
"type": "completion"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.7.1"
},
"eval_batch_size": 1,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": false,
"eval_steps": 0.3333333333333333,
"eval_table_size": 0,
"evals_per_epoch": 1,
"experimental_skip_move_to_device": true,
"flash_attention": false,
"fp16": false,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"group_by_length": false,
"include_tkps": true,
"learning_rate": 0.0002,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 50,
"lora_dropout": 0.0,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"micro_batch_size": 1,
"model_config_type": "qwen2",
"num_epochs": 3.0,
"optimizer": "adamw_torch_fused",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "/content/finetuned_model",
"pad_to_sequence_len": true,
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": false,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 200,
"sequence_len": 1024,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": false,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "/content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"trust_remote_code": true,
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.05,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_steps": 10,
"weight_decay": 0.0,
"world_size": 1
}
[2025-12-16 09:55:50,084] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:25550] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
[2025-12-16 09:55:52,740] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:25550] EOS: 151645 / <|im_end|>
[2025-12-16 09:55:52,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:25550] BOS: None / None
[2025-12-16 09:55:52,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:25550] PAD: 151643 / <|endoftext|>
[2025-12-16 09:55:52,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:25550] UNK: None / None
[2025-12-16 09:55:52,742] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:25550] Unable to find prepared dataset in last_run_prepared/77e1f6e06cc86d297d78dc02a7c9ecb5
[2025-12-16 09:55:52,742] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:25550] Loading raw datasets...
[2025-12-16 09:55:52,742] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:25550] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2025-12-16 09:55:53,454] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:25550] Loading dataset: /content/processed_output/clartts_data.jsonl with base_type: completion and prompt_style: None
[2025-12-16 09:55:53,620] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:25550] min_input_len: 147
[2025-12-16 09:55:53,620] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:25550] max_input_len: 824
Dropping Long Sequences (>1024) (num_proc=12): 0% 0/2850 [00:00<?, ? examples/s] Dropping Long Sequences (>1024) (num_proc=12): 8% 238/2850 [00:00<00:04, 616.00 examples/s] Dropping Long Sequences (>1024) (num_proc=12): 100% 2850/2850 [00:00<00:00, 5142.08 examples/s]
Saving the dataset (0/11 shards): 0% 0/2850 [00:00<?, ? examples/s] Saving the dataset (0/11 shards): 9% 260/2850 [00:00<00:02, 883.44 examples/s] Saving the dataset (1/11 shards): 9% 260/2850 [00:00<00:02, 883.44 examples/s] Saving the dataset (2/11 shards): 18% 519/2850 [00:00<00:02, 883.44 examples/s] Saving the dataset (3/11 shards): 27% 778/2850 [00:00<00:02, 883.44 examples/s] Saving the dataset (4/11 shards): 36% 1037/2850 [00:00<00:02, 883.44 examples/s] Saving the dataset (5/11 shards): 45% 1296/2850 [00:00<00:01, 883.44 examples/s] Saving the dataset (6/11 shards): 55% 1555/2850 [00:00<00:01, 883.44 examples/s] Saving the dataset (7/11 shards): 64% 1814/2850 [00:00<00:01, 883.44 examples/s] Saving the dataset (8/11 shards): 73% 2073/2850 [00:00<00:00, 883.44 examples/s] Saving the dataset (9/11 shards): 82% 2332/2850 [00:00<00:00, 883.44 examples/s] Saving the dataset (10/11 shards): 91% 2591/2850 [00:00<00:00, 883.44 examples/s] Saving the dataset (11/11 shards): 100% 2850/2850 [00:00<00:00, 883.44 examples/s] Saving the dataset (11/11 shards): 100% 2850/2850 [00:00<00:00, 7206.73 examples/s]
[2025-12-16 09:55:54,667] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:25550] total_num_tokens: 831_053
[2025-12-16 09:55:54,691] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:25550] `total_supervised_tokens: 874_475`
[2025-12-16 09:55:54,692] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:25550] total_num_steps: 1016
[2025-12-16 09:55:54,692] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:25550] Maximum number of steps set at 1016
[2025-12-16 09:55:54,741] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:25550] loading tokenizer... /content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM
[2025-12-16 09:55:57,322] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:25550] EOS: 151645 / <|im_end|>
[2025-12-16 09:55:57,322] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:25550] BOS: None / None
[2025-12-16 09:55:57,322] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:25550] PAD: 151643 / <|endoftext|>
[2025-12-16 09:55:57,322] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:25550] UNK: None / None
[2025-12-16 09:55:57,322] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:25550] Loading model
[2025-12-16 09:55:57,332] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:25550] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-12-16 09:55:57,333] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:25550] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-12-16 09:55:58,630] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:25550] Memory usage after model load 0.000GB ()
[2025-12-16 09:56:07,010] [INFO] [axolotl.train.save_initial_configs:417] [PID:25550] Pre-saving tokenizer to /content/finetuned_model...
[2025-12-16 09:56:07,362] [INFO] [axolotl.train.save_initial_configs:422] [PID:25550] Pre-saving model config to /content/finetuned_model...
[2025-12-16 09:56:07,365] [INFO] [axolotl.train.execute_training:212] [PID:25550] Starting trainer...
0% 0/1016 [00:00<?, ?it/s][2025-12-16 09:56:07,903] [INFO] [axolotl.core.trainers.base.evaluate:387] [PID:25550] Running evaluation step...
0% 0/143 [00:00<?, ?it/s]
2% 3/143 [00:00<00:06, 21.93it/s]
4% 6/143 [00:00<00:06, 22.03it/s]
6% 9/143 [00:00<00:06, 21.23it/s]
8% 12/143 [00:00<00:06, 21.70it/s]
10% 15/143 [00:00<00:05, 21.90it/s]
13% 18/143 [00:00<00:05, 21.83it/s]
15% 21/143 [00:00<00:05, 22.00it/s]
17% 24/143 [00:01<00:05, 22.17it/s]
19% 27/143 [00:01<00:05, 22.10it/s]
21% 30/143 [00:01<00:05, 22.25it/s]
23% 33/143 [00:01<00:05, 21.48it/s]
25% 36/143 [00:01<00:04, 21.72it/s]
27% 39/143 [00:01<00:04, 21.93it/s]
29% 42/143 [00:01<00:04, 21.90it/s]
31% 45/143 [00:02<00:04, 21.90it/s]
34% 48/143 [00:02<00:04, 22.12it/s]
36% 51/143 [00:02<00:04, 21.99it/s]
38% 54/143 [00:02<00:04, 22.11it/s]
40% 57/143 [00:02<00:03, 21.97it/s]
42% 60/143 [00:02<00:03, 22.07it/s]
44% 63/143 [00:02<00:03, 21.88it/s]
46% 66/143 [00:03<00:03, 21.94it/s]
48% 69/143 [00:03<00:03, 22.17it/s]
50% 72/143 [00:03<00:03, 22.27it/s]
52% 75/143 [00:03<00:03, 22.12it/s]
55% 78/143 [00:03<00:02, 22.23it/s]
57% 81/143 [00:03<00:02, 22.11it/s]
59% 84/143 [00:03<00:02, 22.24it/s]
61% 87/143 [00:03<00:02, 22.33it/s]
63% 90/143 [00:04<00:02, 22.28it/s]
65% 93/143 [00:04<00:02, 22.43it/s]
67% 96/143 [00:04<00:02, 22.52it/s]
69% 99/143 [00:04<00:01, 22.42it/s]
71% 102/143 [00:04<00:01, 22.59it/s]
73% 105/143 [00:04<00:01, 22.49it/s]
76% 108/143 [00:04<00:01, 22.53it/s]
78% 111/143 [00:05<00:01, 22.69it/s]
80% 114/143 [00:05<00:01, 22.55it/s]
82% 117/143 [00:05<00:01, 22.63it/s]
84% 120/143 [00:05<00:01, 22.73it/s]
86% 123/143 [00:05<00:00, 22.61it/s]
88% 126/143 [00:05<00:00, 22.70it/s]
90% 129/143 [00:05<00:00, 22.60it/s]
92% 132/143 [00:05<00:00, 22.76it/s]
94% 135/143 [00:06<00:00, 22.83it/s]
97% 138/143 [00:06<00:00, 22.67it/s]
99% 141/143 [00:06<00:00, 22.80it/s]
{'eval_loss': 11.850272178649902, 'eval_runtime': 7.1659, 'eval_samples_per_second': 19.956, 'eval_steps_per_second': 19.956, 'memory/max_active (GiB)': 3.1, 'memory/max_allocated (GiB)': 3.1, 'memory/device_reserved (GiB)': 3.2, 'epoch': 0}
0% 0/1016 [00:07<?, ?it/s]
100% 143/143 [00:06<00:00, 22.80it/s]
 0% 1/1016 [00:09<2:44:40, 9.73s/it] 0% 2/1016 [00:11<1:23:54, 4.97s/it] 0% 3/1016 [00:12<57:59, 3.43s/it] 0% 4/1016 [00:14<45:34, 2.70s/it] 0% 5/1016 [00:16<38:58, 2.31s/it] 1% 6/1016 [00:17<34:51, 2.07s/it] 1% 7/1016 [00:19<32:14, 1.92s/it] 1% 8/1016 [00:20<30:34, 1.82s/it] 1% 9/1016 [00:22<29:27, 1.75s/it] 1% 10/1016 [00:24<28:47, 1.72s/it] 1% 11/1016 [00:25<28:16, 1.69s/it] 1% 12/1016 [00:27<27:54, 1.67s/it] 1% 13/1016 [00:29<27:36, 1.65s/it] 1% 14/1016 [00:30<27:17, 1.63s/it] 1% 15/1016 [00:32<27:10, 1.63s/it] 2% 16/1016 [00:33<27:14, 1.63s/it] 2% 17/1016 [00:36<30:54, 1.86s/it] 2% 18/1016 [00:38<30:12, 1.82s/it] 2% 19/1016 [00:39<29:59, 1.81s/it] 2% 20/1016 [00:41<29:48, 1.80s/it] 2% 21/1016 [00:43<29:11, 1.76s/it] 2% 22/1016 [00:44<28:32, 1.72s/it] 2% 23/1016 [00:46<27:59, 1.69s/it] 2% 24/1016 [00:48<27:46, 1.68s/it] 2% 25/1016 [00:49<27:28, 1.66s/it] 3% 26/1016 [00:51<27:15, 1.65s/it] 3% 27/1016 [00:53<27:03, 1.64s/it] 3% 28/1016 [00:54<26:58, 1.64s/it] 3% 29/1016 [00:56<26:51, 1.63s/it] 3% 30/1016 [00:57<26:56, 1.64s/it] 3% 31/1016 [00:59<26:51, 1.64s/it] 3% 32/1016 [01:01<26:39, 1.63s/it] 3% 33/1016 [01:02<26:32, 1.62s/it] 3% 34/1016 [01:04<26:35, 1.63s/it] 3% 35/1016 [01:06<26:31, 1.62s/it] 4% 36/1016 [01:07<26:26, 1.62s/it] 4% 37/1016 [01:09<26:35, 1.63s/it] 4% 38/1016 [01:10<26:36, 1.63s/it] 4% 39/1016 [01:12<26:27, 1.62s/it] 4% 40/1016 [01:14<26:25, 1.62s/it] 4% 41/1016 [01:15<26:20, 1.62s/it] 4% 42/1016 [01:17<26:15, 1.62s/it] 4% 43/1016 [01:19<28:40, 1.77s/it] 4% 44/1016 [01:21<28:09, 1.74s/it] 4% 45/1016 [01:22<27:38, 1.71s/it] 5% 46/1016 [01:24<27:13, 1.68s/it] 5% 47/1016 [01:26<26:56, 1.67s/it] 5% 48/1016 [01:27<26:41, 1.65s/it] 5% 49/1016 [01:29<26:26, 1.64s/it] 5% 50/1016 [01:30<26:09, 1.62s/it] {'loss': 7.2742, 'grad_norm': 9.9116792678833, 'learning_rate': 0.00019925925947187668, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 2082.96, 'total_tokens': 164572, 'epoch': 0.15}
5% 50/1016 [01:30<26:09, 1.62s/it] 5% 51/1016 [01:32<26:00, 1.62s/it] 5% 52/1016 [01:34<26:07, 1.63s/it] 5% 53/1016 [01:35<26:01, 1.62s/it] 5% 54/1016 [01:37<25:57, 1.62s/it] 5% 55/1016 [01:38<25:53, 1.62s/it] 6% 56/1016 [01:40<25:55, 1.62s/it] 6% 57/1016 [01:42<25:50, 1.62s/it] 6% 58/1016 [01:43<25:42, 1.61s/it] 6% 59/1016 [01:45<25:47, 1.62s/it] 6% 60/1016 [01:47<25:56, 1.63s/it] 6% 61/1016 [01:48<25:54, 1.63s/it] 6% 62/1016 [01:50<25:56, 1.63s/it] 6% 63/1016 [01:51<25:50, 1.63s/it] 6% 64/1016 [01:54<28:02, 1.77s/it] 6% 65/1016 [01:55<27:11, 1.72s/it] 6% 66/1016 [01:57<26:46, 1.69s/it] 7% 67/1016 [01:58<26:28, 1.67s/it] 7% 68/1016 [02:00<26:09, 1.66s/it] 7% 69/1016 [02:02<26:01, 1.65s/it] 7% 70/1016 [02:03<25:46, 1.64s/it] 7% 71/1016 [02:05<25:47, 1.64s/it] 7% 72/1016 [02:07<26:22, 1.68s/it] 7% 73/1016 [02:08<26:50, 1.71s/it] 7% 74/1016 [02:10<27:11, 1.73s/it] 7% 75/1016 [02:12<27:19, 1.74s/it] 7% 76/1016 [02:14<27:23, 1.75s/it] 8% 77/1016 [02:16<27:22, 1.75s/it] 8% 78/1016 [02:17<27:25, 1.75s/it] 8% 79/1016 [02:19<27:26, 1.76s/it] 8% 80/1016 [02:21<27:29, 1.76s/it] 8% 81/1016 [02:23<30:12, 1.94s/it] 8% 82/1016 [02:25<29:26, 1.89s/it] 8% 83/1016 [02:27<28:49, 1.85s/it] 8% 84/1016 [02:29<28:27, 1.83s/it] 8% 85/1016 [02:30<28:16, 1.82s/it] 8% 86/1016 [02:32<27:56, 1.80s/it] 9% 87/1016 [02:34<27:53, 1.80s/it] 9% 88/1016 [02:36<27:43, 1.79s/it] 9% 89/1016 [02:37<27:24, 1.77s/it] 9% 90/1016 [02:39<26:48, 1.74s/it] 9% 91/1016 [02:41<26:23, 1.71s/it] 9% 92/1016 [02:42<26:37, 1.73s/it] 9% 93/1016 [02:44<26:44, 1.74s/it] 9% 94/1016 [02:46<27:01, 1.76s/it] 9% 95/1016 [02:48<26:44, 1.74s/it] 9% 96/1016 [02:49<26:12, 1.71s/it] 10% 97/1016 [02:51<25:46, 1.68s/it] 10% 98/1016 [02:53<25:37, 1.68s/it] 10% 99/1016 [02:54<25:19, 1.66s/it] 10% 100/1016 [02:56<25:03, 1.64s/it] {'loss': 5.6623, 'grad_norm': 12.538771629333496, 'learning_rate': 0.0001961624298837552, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 1546.73, 'total_tokens': 287922, 'epoch': 0.3}
10% 100/1016 [02:56<25:03, 1.64s/it] 10% 101/1016 [02:58<24:57, 1.64s/it] 10% 102/1016 [02:59<24:53, 1.63s/it] 10% 103/1016 [03:01<24:46, 1.63s/it] 10% 104/1016 [03:02<24:47, 1.63s/it] 10% 105/1016 [03:04<24:39, 1.62s/it] 10% 106/1016 [03:06<24:36, 1.62s/it] 11% 107/1016 [03:08<26:40, 1.76s/it] 11% 108/1016 [03:09<25:56, 1.71s/it] 11% 109/1016 [03:11<25:27, 1.68s/it] 11% 110/1016 [03:13<25:03, 1.66s/it] 11% 111/1016 [03:14<24:48, 1.64s/it] 11% 112/1016 [03:16<24:43, 1.64s/it] 11% 113/1016 [03:17<24:34, 1.63s/it] 11% 114/1016 [03:19<24:24, 1.62s/it] 11% 115/1016 [03:21<24:18, 1.62s/it] 11% 116/1016 [03:22<24:21, 1.62s/it] 12% 117/1016 [03:24<24:18, 1.62s/it] 12% 118/1016 [03:25<24:13, 1.62s/it] 12% 119/1016 [03:27<24:08, 1.61s/it] 12% 120/1016 [03:29<24:10, 1.62s/it] 12% 121/1016 [03:30<24:10, 1.62s/it] 12% 122/1016 [03:32<24:03, 1.61s/it] 12% 123/1016 [03:34<24:04, 1.62s/it] 12% 124/1016 [03:35<24:05, 1.62s/it] 12% 125/1016 [03:37<23:56, 1.61s/it] 12% 126/1016 [03:38<23:53, 1.61s/it] 12% 127/1016 [03:40<23:49, 1.61s/it] 13% 128/1016 [03:42<25:53, 1.75s/it] 13% 129/1016 [03:44<25:12, 1.71s/it] 13% 130/1016 [03:45<24:45, 1.68s/it] 13% 131/1016 [03:47<24:26, 1.66s/it] 13% 132/1016 [03:48<24:12, 1.64s/it] 13% 133/1016 [03:50<24:00, 1.63s/it] 13% 134/1016 [03:52<23:48, 1.62s/it] 13% 135/1016 [03:53<23:44, 1.62s/it] 13% 136/1016 [03:55<23:43, 1.62s/it] 13% 137/1016 [03:56<23:34, 1.61s/it] 14% 138/1016 [03:58<23:37, 1.61s/it] 14% 139/1016 [04:00<23:40, 1.62s/it] 14% 140/1016 [04:01<23:38, 1.62s/it] 14% 141/1016 [04:03<23:35, 1.62s/it] 14% 142/1016 [04:05<23:31, 1.62s/it] 14% 143/1016 [04:06<23:27, 1.61s/it] 14% 144/1016 [04:08<23:28, 1.62s/it] 14% 145/1016 [04:10<26:20, 1.81s/it] 14% 146/1016 [04:12<26:14, 1.81s/it] 14% 147/1016 [04:14<26:03, 1.80s/it] 15% 148/1016 [04:15<25:29, 1.76s/it] 15% 149/1016 [04:17<24:55, 1.72s/it] 15% 150/1016 [04:19<24:27, 1.69s/it] {'loss': 5.3752, 'grad_norm': 25.31467628479004, 'learning_rate': 0.00019072586525126637, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 1524.55, 'total_tokens': 410915, 'epoch': 0.44}
15% 150/1016 [04:19<24:27, 1.69s/it] 15% 151/1016 [04:20<24:22, 1.69s/it] 15% 152/1016 [04:22<24:17, 1.69s/it] 15% 153/1016 [04:24<24:12, 1.68s/it] 15% 154/1016 [04:25<23:59, 1.67s/it] 15% 155/1016 [04:27<23:56, 1.67s/it] 15% 156/1016 [04:29<23:45, 1.66s/it] 15% 157/1016 [04:30<23:37, 1.65s/it] 16% 158/1016 [04:32<23:29, 1.64s/it] 16% 159/1016 [04:33<23:27, 1.64s/it] 16% 160/1016 [04:35<23:25, 1.64s/it] 16% 161/1016 [04:37<23:26, 1.64s/it] 16% 162/1016 [04:38<23:26, 1.65s/it] 16% 163/1016 [04:40<23:15, 1.64s/it] 16% 164/1016 [04:42<23:04, 1.63s/it] 16% 165/1016 [04:43<22:56, 1.62s/it] 16% 166/1016 [04:45<22:55, 1.62s/it] 16% 167/1016 [04:46<23:00, 1.63s/it] 17% 168/1016 [04:48<23:00, 1.63s/it] 17% 169/1016 [04:50<22:53, 1.62s/it] 17% 170/1016 [04:51<22:49, 1.62s/it] 17% 171/1016 [04:53<24:52, 1.77s/it] 17% 172/1016 [04:55<24:13, 1.72s/it] 17% 173/1016 [04:57<23:42, 1.69s/it] 17% 174/1016 [04:58<23:27, 1.67s/it] 17% 175/1016 [05:00<23:20, 1.66s/it] 17% 176/1016 [05:02<23:05, 1.65s/it] 17% 177/1016 [05:03<22:54, 1.64s/it] 18% 178/1016 [05:05<22:44, 1.63s/it] 18% 179/1016 [05:06<22:34, 1.62s/it] 18% 180/1016 [05:08<22:30, 1.62s/it] 18% 181/1016 [05:10<22:24, 1.61s/it] 18% 182/1016 [05:11<22:28, 1.62s/it] 18% 183/1016 [05:13<22:27, 1.62s/it] 18% 184/1016 [05:14<22:21, 1.61s/it] 18% 185/1016 [05:16<22:21, 1.61s/it] 18% 186/1016 [05:18<22:23, 1.62s/it] 18% 187/1016 [05:19<22:23, 1.62s/it] 19% 188/1016 [05:21<22:21, 1.62s/it] 19% 189/1016 [05:23<22:18, 1.62s/it] 19% 190/1016 [05:24<22:25, 1.63s/it] 19% 191/1016 [05:26<24:30, 1.78s/it] 19% 192/1016 [05:28<23:46, 1.73s/it] 19% 193/1016 [05:30<23:15, 1.70s/it] 19% 194/1016 [05:31<22:53, 1.67s/it] 19% 195/1016 [05:33<22:42, 1.66s/it] 19% 196/1016 [05:34<22:31, 1.65s/it] 19% 197/1016 [05:36<22:25, 1.64s/it] 19% 198/1016 [05:38<22:44, 1.67s/it] 20% 199/1016 [05:40<23:06, 1.70s/it] 20% 200/1016 [05:41<23:25, 1.72s/it] {'loss': 5.0362, 'grad_norm': 25.589689254760742, 'learning_rate': 0.00018308184302213046, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 1390.99, 'total_tokens': 533959, 'epoch': 0.59}
20% 200/1016 [05:41<23:25, 1.72s/it][2025-12-16 10:01:49,745] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-200
20% 201/1016 [05:53<1:03:37, 4.68s/it] 20% 202/1016 [05:55<51:49, 3.82s/it] 20% 203/1016 [05:57<43:27, 3.21s/it] 20% 204/1016 [05:58<37:32, 2.77s/it] 20% 205/1016 [06:00<33:28, 2.48s/it] 20% 206/1016 [06:02<30:33, 2.26s/it] 20% 207/1016 [06:04<28:32, 2.12s/it] 20% 208/1016 [06:05<27:08, 2.02s/it] 21% 209/1016 [06:07<26:07, 1.94s/it] 21% 210/1016 [06:09<27:23, 2.04s/it] 21% 211/1016 [06:11<25:45, 1.92s/it] 21% 212/1016 [06:13<24:47, 1.85s/it] 21% 213/1016 [06:15<24:23, 1.82s/it] 21% 214/1016 [06:16<24:07, 1.81s/it] 21% 215/1016 [06:18<23:48, 1.78s/it] 21% 216/1016 [06:20<23:13, 1.74s/it] 21% 217/1016 [06:21<22:37, 1.70s/it] 21% 218/1016 [06:23<22:17, 1.68s/it] 22% 219/1016 [06:25<22:13, 1.67s/it] 22% 220/1016 [06:26<21:59, 1.66s/it] 22% 221/1016 [06:28<21:43, 1.64s/it] 22% 222/1016 [06:29<21:38, 1.64s/it] 22% 223/1016 [06:31<21:32, 1.63s/it] 22% 224/1016 [06:33<21:23, 1.62s/it] 22% 225/1016 [06:34<21:19, 1.62s/it] 22% 226/1016 [06:36<21:14, 1.61s/it] 22% 227/1016 [06:37<21:11, 1.61s/it] 22% 228/1016 [06:39<23:01, 1.75s/it] 23% 229/1016 [06:41<22:23, 1.71s/it] 23% 230/1016 [06:43<21:57, 1.68s/it] 23% 231/1016 [06:44<21:39, 1.66s/it] 23% 232/1016 [06:46<21:22, 1.64s/it] 23% 233/1016 [06:48<21:15, 1.63s/it] 23% 234/1016 [06:49<21:22, 1.64s/it] 23% 235/1016 [06:51<21:12, 1.63s/it] 23% 236/1016 [06:52<21:05, 1.62s/it] 23% 237/1016 [06:54<21:05, 1.62s/it] 23% 238/1016 [06:56<21:02, 1.62s/it] 24% 239/1016 [06:57<20:57, 1.62s/it] 24% 240/1016 [06:59<20:57, 1.62s/it] 24% 241/1016 [07:01<20:58, 1.62s/it] 24% 242/1016 [07:02<20:52, 1.62s/it] 24% 243/1016 [07:04<20:48, 1.61s/it] 24% 244/1016 [07:05<20:46, 1.61s/it] 24% 245/1016 [07:07<20:50, 1.62s/it] 24% 246/1016 [07:09<20:48, 1.62s/it] 24% 247/1016 [07:10<20:43, 1.62s/it] 24% 248/1016 [07:12<20:39, 1.61s/it] 25% 249/1016 [07:13<20:36, 1.61s/it] 25% 250/1016 [07:15<20:30, 1.61s/it] {'loss': 4.8371, 'grad_norm': 26.41189956665039, 'learning_rate': 0.00017341635045468791, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 1552.5, 'total_tokens': 656838, 'epoch': 0.74}
25% 250/1016 [07:15<20:30, 1.61s/it] 25% 251/1016 [07:17<20:26, 1.60s/it] 25% 252/1016 [07:18<20:30, 1.61s/it] 25% 253/1016 [07:20<20:29, 1.61s/it] 25% 254/1016 [07:22<22:22, 1.76s/it] 25% 255/1016 [07:24<21:43, 1.71s/it] 25% 256/1016 [07:25<21:21, 1.69s/it] 25% 257/1016 [07:27<21:05, 1.67s/it] 25% 258/1016 [07:28<20:52, 1.65s/it] 25% 259/1016 [07:30<20:39, 1.64s/it] 26% 260/1016 [07:32<20:32, 1.63s/it] 26% 261/1016 [07:33<20:26, 1.62s/it] 26% 262/1016 [07:35<20:19, 1.62s/it] 26% 263/1016 [07:36<20:16, 1.62s/it] 26% 264/1016 [07:38<20:15, 1.62s/it] 26% 265/1016 [07:40<20:07, 1.61s/it] 26% 266/1016 [07:41<20:16, 1.62s/it] 26% 267/1016 [07:43<20:46, 1.66s/it] 26% 268/1016 [07:45<21:07, 1.69s/it] 26% 269/1016 [07:47<21:19, 1.71s/it] 27% 270/1016 [07:48<21:12, 1.71s/it] 27% 271/1016 [07:50<20:57, 1.69s/it] 27% 272/1016 [07:52<20:40, 1.67s/it] 27% 273/1016 [07:53<20:35, 1.66s/it] 27% 274/1016 [07:55<22:13, 1.80s/it] 27% 275/1016 [07:57<21:30, 1.74s/it] 27% 276/1016 [07:59<21:03, 1.71s/it] 27% 277/1016 [08:00<20:41, 1.68s/it] 27% 278/1016 [08:02<20:27, 1.66s/it] 27% 279/1016 [08:03<20:09, 1.64s/it] 28% 280/1016 [08:05<19:57, 1.63s/it] 28% 281/1016 [08:07<19:50, 1.62s/it] 28% 282/1016 [08:08<19:46, 1.62s/it] 28% 283/1016 [08:10<19:44, 1.62s/it] 28% 284/1016 [08:11<19:38, 1.61s/it] 28% 285/1016 [08:13<19:39, 1.61s/it] 28% 286/1016 [08:15<19:37, 1.61s/it] 28% 287/1016 [08:16<19:35, 1.61s/it] 28% 288/1016 [08:18<19:28, 1.60s/it] 28% 289/1016 [08:19<19:26, 1.60s/it] 29% 290/1016 [08:21<19:29, 1.61s/it] 29% 291/1016 [08:23<19:27, 1.61s/it] 29% 292/1016 [08:25<21:18, 1.77s/it] 29% 293/1016 [08:26<20:47, 1.73s/it] 29% 294/1016 [08:28<20:19, 1.69s/it] 29% 295/1016 [08:30<20:02, 1.67s/it] 29% 296/1016 [08:31<19:47, 1.65s/it] 29% 297/1016 [08:33<19:38, 1.64s/it] 29% 298/1016 [08:34<19:32, 1.63s/it] 29% 299/1016 [08:36<19:28, 1.63s/it] 30% 300/1016 [08:38<19:31, 1.64s/it] {'loss': 4.7248, 'grad_norm': 23.636552810668945, 'learning_rate': 0.00016196455934844978, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'tokens_per_second_per_gpu': 1489.9, 'total_tokens': 779045, 'epoch': 0.89}
30% 300/1016 [08:38<19:31, 1.64s/it] 30% 301/1016 [08:39<19:23, 1.63s/it] 30% 302/1016 [08:41<19:15, 1.62s/it] 30% 303/1016 [08:43<19:11, 1.61s/it] 30% 304/1016 [08:44<19:12, 1.62s/it] 30% 305/1016 [08:46<19:10, 1.62s/it] 30% 306/1016 [08:47<19:06, 1.61s/it] 30% 307/1016 [08:49<19:05, 1.62s/it] 30% 308/1016 [08:51<18:58, 1.61s/it] 30% 309/1016 [08:52<18:52, 1.60s/it] 31% 310/1016 [08:54<18:48, 1.60s/it] 31% 311/1016 [08:55<18:46, 1.60s/it] 31% 312/1016 [08:58<20:30, 1.75s/it] 31% 313/1016 [08:59<19:58, 1.70s/it] 31% 314/1016 [09:01<19:37, 1.68s/it] 31% 315/1016 [09:02<19:20, 1.66s/it] 31% 316/1016 [09:04<19:04, 1.63s/it] 31% 317/1016 [09:05<18:51, 1.62s/it] 31% 318/1016 [09:07<18:42, 1.61s/it] 31% 319/1016 [09:09<18:38, 1.60s/it] 31% 320/1016 [09:10<19:03, 1.64s/it] 32% 321/1016 [09:12<19:23, 1.67s/it] 32% 322/1016 [09:14<19:43, 1.71s/it] 32% 323/1016 [09:16<19:50, 1.72s/it] 32% 324/1016 [09:17<19:53, 1.73s/it] 32% 325/1016 [09:19<19:58, 1.74s/it] 32% 326/1016 [09:21<20:00, 1.74s/it] 32% 327/1016 [09:23<19:59, 1.74s/it] 32% 328/1016 [09:24<19:59, 1.74s/it] 32% 329/1016 [09:26<20:01, 1.75s/it] 32% 330/1016 [09:28<19:56, 1.74s/it] 33% 331/1016 [09:30<19:52, 1.74s/it] 33% 332/1016 [09:31<19:50, 1.74s/it] 33% 333/1016 [09:33<19:50, 1.74s/it] 33% 334/1016 [09:35<19:50, 1.75s/it] 33% 335/1016 [09:37<19:51, 1.75s/it] 33% 336/1016 [09:38<19:45, 1.74s/it] 33% 337/1016 [09:40<20:56, 1.85s/it] 33% 338/1016 [09:42<20:24, 1.81s/it] 33% 339/1016 [09:43<16:46, 1.49s/it][2025-12-16 10:05:51,326] [INFO] [axolotl.core.trainers.base.evaluate:387] [PID:25550] Running evaluation step...
0% 0/143 [00:00<?, ?it/s]
3% 4/143 [00:00<00:05, 25.93it/s]
5% 7/143 [00:00<00:07, 17.69it/s]
6% 9/143 [00:00<00:07, 17.14it/s]
8% 12/143 [00:00<00:07, 18.65it/s]
10% 15/143 [00:00<00:06, 19.68it/s]
13% 18/143 [00:00<00:06, 20.19it/s]
15% 21/143 [00:01<00:05, 20.63it/s]
17% 24/143 [00:01<00:05, 20.84it/s]
19% 27/143 [00:01<00:05, 20.83it/s]
21% 30/143 [00:01<00:05, 21.04it/s]
23% 33/143 [00:01<00:05, 21.04it/s]
25% 36/143 [00:01<00:05, 21.18it/s]
27% 39/143 [00:01<00:04, 21.30it/s]
29% 42/143 [00:02<00:04, 21.22it/s]
31% 45/143 [00:02<00:04, 21.31it/s]
34% 48/143 [00:02<00:04, 21.30it/s]
36% 51/143 [00:02<00:04, 21.08it/s]
38% 54/143 [00:02<00:04, 21.20it/s]
40% 57/143 [00:02<00:04, 21.18it/s]
42% 60/143 [00:02<00:03, 21.28it/s]
44% 63/143 [00:03<00:03, 21.37it/s]
46% 66/143 [00:03<00:03, 21.20it/s]
48% 69/143 [00:03<00:03, 21.26it/s]
50% 72/143 [00:03<00:03, 21.20it/s]
52% 75/143 [00:03<00:03, 21.18it/s]
55% 78/143 [00:03<00:03, 21.22it/s]
57% 81/143 [00:03<00:02, 21.27it/s]
59% 84/143 [00:04<00:02, 21.49it/s]
61% 87/143 [00:04<00:02, 21.59it/s]
63% 90/143 [00:04<00:02, 21.56it/s]
65% 93/143 [00:04<00:02, 21.62it/s]
67% 96/143 [00:04<00:02, 21.78it/s]
69% 99/143 [00:04<00:02, 21.76it/s]
71% 102/143 [00:04<00:01, 21.89it/s]
73% 105/143 [00:04<00:01, 21.74it/s]
76% 108/143 [00:05<00:01, 21.66it/s]
78% 111/143 [00:05<00:01, 21.77it/s]
80% 114/143 [00:05<00:01, 21.62it/s]
82% 117/143 [00:05<00:01, 21.70it/s]
84% 120/143 [00:05<00:01, 21.71it/s]
86% 123/143 [00:05<00:00, 21.68it/s]
88% 126/143 [00:05<00:00, 21.81it/s]
90% 129/143 [00:06<00:00, 21.66it/s]
92% 132/143 [00:06<00:00, 21.69it/s]
94% 135/143 [00:06<00:00, 21.61it/s]
97% 138/143 [00:06<00:00, 21.47it/s]
99% 141/143 [00:06<00:00, 21.64it/s]
{'eval_loss': 4.642312526702881, 'eval_runtime': 7.0402, 'eval_samples_per_second': 20.312, 'eval_steps_per_second': 20.312, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.67, 'epoch': 1.0}
33% 339/1016 [09:50<16:46, 1.49s/it]
100% 143/143 [00:06<00:00, 21.64it/s]
 33% 340/1016 [09:52<43:37, 3.87s/it] 34% 341/1016 [09:54<35:59, 3.20s/it] 34% 342/1016 [09:56<30:31, 2.72s/it] 34% 343/1016 [09:57<26:40, 2.38s/it] 34% 344/1016 [09:59<24:03, 2.15s/it] 34% 345/1016 [10:00<22:15, 1.99s/it] 34% 346/1016 [10:02<20:59, 1.88s/it] 34% 347/1016 [10:04<20:05, 1.80s/it] 34% 348/1016 [10:05<19:24, 1.74s/it] 34% 349/1016 [10:07<18:54, 1.70s/it] 34% 350/1016 [10:08<18:34, 1.67s/it] {'loss': 4.6412, 'grad_norm': 29.976333618164062, 'learning_rate': 0.00014900510406201564, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 346.62, 'total_tokens': 945548, 'epoch': 1.03}
34% 350/1016 [10:08<18:34, 1.67s/it] 35% 351/1016 [10:10<18:19, 1.65s/it] 35% 352/1016 [10:12<18:08, 1.64s/it] 35% 353/1016 [10:13<18:02, 1.63s/it] 35% 354/1016 [10:15<18:00, 1.63s/it] 35% 355/1016 [10:17<17:54, 1.62s/it] 35% 356/1016 [10:18<17:46, 1.62s/it] 35% 357/1016 [10:20<17:43, 1.61s/it] 35% 358/1016 [10:22<20:07, 1.84s/it] 35% 359/1016 [10:24<19:24, 1.77s/it] 35% 360/1016 [10:25<18:55, 1.73s/it] 36% 361/1016 [10:27<18:42, 1.71s/it] 36% 362/1016 [10:29<18:19, 1.68s/it] 36% 363/1016 [10:30<18:00, 1.66s/it] 36% 364/1016 [10:32<17:52, 1.64s/it] 36% 365/1016 [10:33<17:42, 1.63s/it] 36% 366/1016 [10:35<17:35, 1.62s/it] 36% 367/1016 [10:37<17:30, 1.62s/it] 36% 368/1016 [10:38<17:25, 1.61s/it] 36% 369/1016 [10:40<17:21, 1.61s/it] 36% 370/1016 [10:41<17:22, 1.61s/it] 37% 371/1016 [10:43<17:18, 1.61s/it] 37% 372/1016 [10:45<17:16, 1.61s/it] 37% 373/1016 [10:46<17:15, 1.61s/it] 37% 374/1016 [10:48<17:21, 1.62s/it] 37% 375/1016 [10:50<17:17, 1.62s/it] 37% 376/1016 [10:52<18:56, 1.78s/it] 37% 377/1016 [10:53<18:21, 1.72s/it] 37% 378/1016 [10:55<17:55, 1.69s/it] 37% 379/1016 [10:56<17:34, 1.66s/it] 37% 380/1016 [10:58<17:21, 1.64s/it] 38% 381/1016 [11:00<17:15, 1.63s/it] 38% 382/1016 [11:01<17:14, 1.63s/it] 38% 383/1016 [11:03<17:19, 1.64s/it] 38% 384/1016 [11:05<17:12, 1.63s/it] 38% 385/1016 [11:06<17:03, 1.62s/it] 38% 386/1016 [11:08<17:00, 1.62s/it] 38% 387/1016 [11:09<17:09, 1.64s/it] 38% 388/1016 [11:11<17:35, 1.68s/it] 38% 389/1016 [11:13<17:51, 1.71s/it] 38% 390/1016 [11:15<18:12, 1.75s/it] 38% 391/1016 [11:17<17:53, 1.72s/it] 39% 392/1016 [11:18<17:35, 1.69s/it] 39% 393/1016 [11:20<17:20, 1.67s/it] 39% 394/1016 [11:21<17:16, 1.67s/it] 39% 395/1016 [11:23<17:05, 1.65s/it] 39% 396/1016 [11:25<17:00, 1.65s/it] 39% 397/1016 [11:26<16:55, 1.64s/it] 39% 398/1016 [11:28<16:53, 1.64s/it] 39% 399/1016 [11:30<16:46, 1.63s/it] 39% 400/1016 [11:31<16:44, 1.63s/it] {'loss': 4.4916, 'grad_norm': 28.489376068115234, 'learning_rate': 0.00013485330204031937, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1511.66, 'total_tokens': 1067762, 'epoch': 1.18}
39% 400/1016 [11:31<16:44, 1.63s/it][2025-12-16 10:07:39,618] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-400
39% 401/1016 [11:45<53:55, 5.26s/it] 40% 402/1016 [11:47<42:39, 4.17s/it] 40% 403/1016 [11:48<34:48, 3.41s/it] 40% 404/1016 [11:50<29:20, 2.88s/it] 40% 405/1016 [11:51<25:30, 2.50s/it] 40% 406/1016 [11:53<22:47, 2.24s/it] 40% 407/1016 [11:55<20:50, 2.05s/it] 40% 408/1016 [11:56<19:28, 1.92s/it] 40% 409/1016 [11:58<18:34, 1.84s/it] 40% 410/1016 [12:00<17:56, 1.78s/it] 40% 411/1016 [12:01<17:28, 1.73s/it] 41% 412/1016 [12:03<17:11, 1.71s/it] 41% 413/1016 [12:04<16:53, 1.68s/it] 41% 414/1016 [12:06<16:38, 1.66s/it] 41% 415/1016 [12:08<16:34, 1.65s/it] 41% 416/1016 [12:09<16:25, 1.64s/it] 41% 417/1016 [12:11<16:21, 1.64s/it] 41% 418/1016 [12:13<16:16, 1.63s/it] 41% 419/1016 [12:15<17:46, 1.79s/it] 41% 420/1016 [12:16<17:18, 1.74s/it] 41% 421/1016 [12:18<16:52, 1.70s/it] 42% 422/1016 [12:20<16:34, 1.67s/it] 42% 423/1016 [12:21<16:22, 1.66s/it] 42% 424/1016 [12:23<16:14, 1.65s/it] 42% 425/1016 [12:24<16:08, 1.64s/it] 42% 426/1016 [12:26<16:04, 1.64s/it] 42% 427/1016 [12:28<16:00, 1.63s/it] 42% 428/1016 [12:29<15:57, 1.63s/it] 42% 429/1016 [12:31<15:53, 1.62s/it] 42% 430/1016 [12:33<15:51, 1.62s/it] 42% 431/1016 [12:34<15:48, 1.62s/it] 43% 432/1016 [12:36<15:46, 1.62s/it] 43% 433/1016 [12:37<15:49, 1.63s/it] 43% 434/1016 [12:39<16:19, 1.68s/it] 43% 435/1016 [12:41<16:31, 1.71s/it] 43% 436/1016 [12:43<16:41, 1.73s/it] 43% 437/1016 [12:45<16:46, 1.74s/it] 43% 438/1016 [12:46<16:50, 1.75s/it] 43% 439/1016 [12:48<16:57, 1.76s/it] 43% 440/1016 [12:50<17:01, 1.77s/it] 43% 441/1016 [12:52<17:05, 1.78s/it] 44% 442/1016 [12:54<17:02, 1.78s/it] 44% 443/1016 [12:55<16:55, 1.77s/it] 44% 444/1016 [12:57<16:53, 1.77s/it] 44% 445/1016 [12:59<18:27, 1.94s/it] 44% 446/1016 [13:01<17:58, 1.89s/it] 44% 447/1016 [13:03<17:39, 1.86s/it] 44% 448/1016 [13:05<17:34, 1.86s/it] 44% 449/1016 [13:07<17:17, 1.83s/it] 44% 450/1016 [13:08<17:05, 1.81s/it] {'loss': 4.4404, 'grad_norm': 25.01222038269043, 'learning_rate': 0.0001198534818030452, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1375.01, 'total_tokens': 1188747, 'epoch': 1.33}
44% 450/1016 [13:08<17:05, 1.81s/it] 44% 451/1016 [13:10<16:55, 1.80s/it] 44% 452/1016 [13:12<16:36, 1.77s/it] 45% 453/1016 [13:13<16:08, 1.72s/it] 45% 454/1016 [13:15<16:16, 1.74s/it] 45% 455/1016 [13:17<16:23, 1.75s/it] 45% 456/1016 [13:19<16:24, 1.76s/it] 45% 457/1016 [13:20<16:09, 1.73s/it] 45% 458/1016 [13:22<15:49, 1.70s/it] 45% 459/1016 [13:24<15:33, 1.68s/it] 45% 460/1016 [13:25<15:32, 1.68s/it] 45% 461/1016 [13:27<15:25, 1.67s/it] 45% 462/1016 [13:29<15:21, 1.66s/it] 46% 463/1016 [13:30<15:09, 1.64s/it] 46% 464/1016 [13:32<15:02, 1.63s/it] 46% 465/1016 [13:33<14:56, 1.63s/it] 46% 466/1016 [13:36<16:13, 1.77s/it] 46% 467/1016 [13:37<15:44, 1.72s/it] 46% 468/1016 [13:39<15:25, 1.69s/it] 46% 469/1016 [13:40<15:18, 1.68s/it] 46% 470/1016 [13:42<15:05, 1.66s/it] 46% 471/1016 [13:44<14:56, 1.64s/it] 46% 472/1016 [13:45<14:47, 1.63s/it] 47% 473/1016 [13:47<14:41, 1.62s/it] 47% 474/1016 [13:48<14:38, 1.62s/it] 47% 475/1016 [13:50<14:35, 1.62s/it] 47% 476/1016 [13:52<14:38, 1.63s/it] 47% 477/1016 [13:53<14:35, 1.62s/it] 47% 478/1016 [13:55<14:30, 1.62s/it] 47% 479/1016 [13:57<14:28, 1.62s/it] 47% 480/1016 [13:58<14:25, 1.61s/it] 47% 481/1016 [14:00<14:22, 1.61s/it] 47% 482/1016 [14:01<14:19, 1.61s/it] 48% 483/1016 [14:03<15:36, 1.76s/it] 48% 484/1016 [14:05<15:15, 1.72s/it] 48% 485/1016 [14:07<14:56, 1.69s/it] 48% 486/1016 [14:08<14:40, 1.66s/it] 48% 487/1016 [14:10<14:28, 1.64s/it] 48% 488/1016 [14:12<14:23, 1.63s/it] 48% 489/1016 [14:13<14:18, 1.63s/it] 48% 490/1016 [14:15<14:15, 1.63s/it] 48% 491/1016 [14:16<14:19, 1.64s/it] 48% 492/1016 [14:18<14:16, 1.63s/it] 49% 493/1016 [14:20<14:09, 1.62s/it] 49% 494/1016 [14:21<14:04, 1.62s/it] 49% 495/1016 [14:23<13:59, 1.61s/it] 49% 496/1016 [14:25<14:00, 1.62s/it] 49% 497/1016 [14:26<13:58, 1.62s/it] 49% 498/1016 [14:28<13:59, 1.62s/it] 49% 499/1016 [14:29<14:00, 1.62s/it] 49% 500/1016 [14:31<13:57, 1.62s/it] {'loss': 4.4182, 'grad_norm': 19.93057632446289, 'learning_rate': 0.00010437060506248341, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1540.13, 'total_tokens': 1312791, 'epoch': 1.48}
49% 500/1016 [14:31<13:57, 1.62s/it] 49% 501/1016 [14:33<13:54, 1.62s/it] 49% 502/1016 [14:34<13:52, 1.62s/it] 50% 503/1016 [14:36<13:52, 1.62s/it] 50% 504/1016 [14:37<13:47, 1.62s/it] 50% 505/1016 [14:39<13:47, 1.62s/it] 50% 506/1016 [14:41<13:55, 1.64s/it] 50% 507/1016 [14:43<14:06, 1.66s/it] 50% 508/1016 [14:44<14:19, 1.69s/it] 50% 509/1016 [14:47<15:54, 1.88s/it] 50% 510/1016 [14:48<15:29, 1.84s/it] 50% 511/1016 [14:50<14:59, 1.78s/it] 50% 512/1016 [14:52<14:31, 1.73s/it] 50% 513/1016 [14:53<14:21, 1.71s/it] 51% 514/1016 [14:55<14:08, 1.69s/it] 51% 515/1016 [14:57<13:54, 1.67s/it] 51% 516/1016 [14:58<13:48, 1.66s/it] 51% 517/1016 [15:00<13:41, 1.65s/it] 51% 518/1016 [15:01<13:35, 1.64s/it] 51% 519/1016 [15:03<13:32, 1.64s/it] 51% 520/1016 [15:05<13:28, 1.63s/it] 51% 521/1016 [15:06<13:23, 1.62s/it] 51% 522/1016 [15:08<13:24, 1.63s/it] 51% 523/1016 [15:09<13:16, 1.62s/it] 52% 524/1016 [15:11<13:14, 1.61s/it] 52% 525/1016 [15:13<13:11, 1.61s/it] 52% 526/1016 [15:14<13:09, 1.61s/it] 52% 527/1016 [15:16<13:09, 1.61s/it] 52% 528/1016 [15:18<13:12, 1.62s/it] 52% 529/1016 [15:20<14:25, 1.78s/it] 52% 530/1016 [15:21<13:58, 1.73s/it] 52% 531/1016 [15:23<13:41, 1.69s/it] 52% 532/1016 [15:25<13:26, 1.67s/it] 52% 533/1016 [15:26<13:14, 1.65s/it] 53% 534/1016 [15:28<13:10, 1.64s/it] 53% 535/1016 [15:29<13:07, 1.64s/it] 53% 536/1016 [15:31<13:01, 1.63s/it] 53% 537/1016 [15:33<12:56, 1.62s/it] 53% 538/1016 [15:34<12:51, 1.61s/it] 53% 539/1016 [15:36<12:49, 1.61s/it] 53% 540/1016 [15:37<12:46, 1.61s/it] 53% 541/1016 [15:39<12:46, 1.61s/it] 53% 542/1016 [15:41<12:47, 1.62s/it] 53% 543/1016 [15:42<12:46, 1.62s/it] 54% 544/1016 [15:44<12:48, 1.63s/it] 54% 545/1016 [15:46<12:43, 1.62s/it] 54% 546/1016 [15:47<12:38, 1.61s/it] 54% 547/1016 [15:49<13:45, 1.76s/it] 54% 548/1016 [15:51<13:20, 1.71s/it] 54% 549/1016 [15:52<13:06, 1.68s/it] 54% 550/1016 [15:54<12:56, 1.67s/it] {'loss': 4.3869, 'grad_norm': 20.955829620361328, 'learning_rate': 8.878138681368239e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1544.52, 'total_tokens': 1437766, 'epoch': 1.62}
54% 550/1016 [15:54<12:56, 1.67s/it] 54% 551/1016 [15:56<12:47, 1.65s/it] 54% 552/1016 [15:57<12:40, 1.64s/it] 54% 553/1016 [15:59<12:34, 1.63s/it] 55% 554/1016 [16:01<12:29, 1.62s/it] 55% 555/1016 [16:02<12:27, 1.62s/it] 55% 556/1016 [16:04<12:25, 1.62s/it] 55% 557/1016 [16:05<12:28, 1.63s/it] 55% 558/1016 [16:07<12:24, 1.63s/it] 55% 559/1016 [16:09<12:24, 1.63s/it] 55% 560/1016 [16:10<12:25, 1.64s/it] 55% 561/1016 [16:12<12:26, 1.64s/it] 55% 562/1016 [16:14<12:25, 1.64s/it] 55% 563/1016 [16:15<12:40, 1.68s/it] 56% 564/1016 [16:17<12:54, 1.71s/it] 56% 565/1016 [16:19<13:00, 1.73s/it] 56% 566/1016 [16:21<13:03, 1.74s/it] 56% 567/1016 [16:22<13:02, 1.74s/it] 56% 568/1016 [16:24<13:03, 1.75s/it] 56% 569/1016 [16:26<13:06, 1.76s/it] 56% 570/1016 [16:28<13:07, 1.77s/it] 56% 571/1016 [16:30<13:10, 1.78s/it] 56% 572/1016 [16:32<14:23, 1.94s/it] 56% 573/1016 [16:34<13:58, 1.89s/it] 56% 574/1016 [16:35<13:37, 1.85s/it] 57% 575/1016 [16:37<13:22, 1.82s/it] 57% 576/1016 [16:39<13:13, 1.80s/it] 57% 577/1016 [16:41<13:08, 1.80s/it] 57% 578/1016 [16:43<13:04, 1.79s/it] 57% 579/1016 [16:44<12:58, 1.78s/it] 57% 580/1016 [16:46<12:51, 1.77s/it] 57% 581/1016 [16:48<12:37, 1.74s/it] 57% 582/1016 [16:49<12:17, 1.70s/it] 57% 583/1016 [16:51<12:20, 1.71s/it] 57% 584/1016 [16:53<12:27, 1.73s/it] 58% 585/1016 [16:55<12:35, 1.75s/it] 58% 586/1016 [16:56<12:29, 1.74s/it] 58% 587/1016 [16:58<12:14, 1.71s/it] 58% 588/1016 [17:00<12:01, 1.69s/it] 58% 589/1016 [17:01<11:54, 1.67s/it] 58% 590/1016 [17:03<11:51, 1.67s/it] 58% 591/1016 [17:05<11:44, 1.66s/it] 58% 592/1016 [17:06<11:38, 1.65s/it] 58% 593/1016 [17:08<11:31, 1.63s/it] 58% 594/1016 [17:09<11:24, 1.62s/it] 59% 595/1016 [17:11<11:20, 1.62s/it] 59% 596/1016 [17:13<11:17, 1.61s/it] 59% 597/1016 [17:14<11:14, 1.61s/it] 59% 598/1016 [17:16<12:16, 1.76s/it] 59% 599/1016 [17:18<11:58, 1.72s/it] 59% 600/1016 [17:20<11:44, 1.69s/it] {'loss': 4.359, 'grad_norm': 27.51922035217285, 'learning_rate': 7.346512945462767e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1525.07, 'total_tokens': 1560469, 'epoch': 1.77}
59% 600/1016 [17:20<11:44, 1.69s/it][2025-12-16 10:13:27,932] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-600
59% 601/1016 [17:29<27:46, 4.02s/it] 59% 602/1016 [17:31<22:43, 3.29s/it] 59% 603/1016 [17:32<19:10, 2.79s/it] 59% 604/1016 [17:34<16:40, 2.43s/it] 60% 605/1016 [17:35<14:58, 2.19s/it] 60% 606/1016 [17:37<13:48, 2.02s/it] 60% 607/1016 [17:39<12:58, 1.90s/it] 60% 608/1016 [17:40<12:22, 1.82s/it] 60% 609/1016 [17:42<11:57, 1.76s/it] 60% 610/1016 [17:44<11:38, 1.72s/it] 60% 611/1016 [17:45<11:24, 1.69s/it] 60% 612/1016 [17:47<11:14, 1.67s/it] 60% 613/1016 [17:48<11:06, 1.65s/it] 60% 614/1016 [17:50<11:00, 1.64s/it] 61% 615/1016 [17:52<10:55, 1.64s/it] 61% 616/1016 [17:53<10:51, 1.63s/it] 61% 617/1016 [17:55<10:50, 1.63s/it] 61% 618/1016 [17:57<11:42, 1.76s/it] 61% 619/1016 [17:59<11:20, 1.71s/it] 61% 620/1016 [18:00<11:08, 1.69s/it] 61% 621/1016 [18:02<10:59, 1.67s/it] 61% 622/1016 [18:03<10:52, 1.66s/it] 61% 623/1016 [18:05<10:45, 1.64s/it] 61% 624/1016 [18:07<10:44, 1.64s/it] 62% 625/1016 [18:08<10:40, 1.64s/it] 62% 626/1016 [18:10<10:34, 1.63s/it] 62% 627/1016 [18:12<10:32, 1.63s/it] 62% 628/1016 [18:13<10:29, 1.62s/it] 62% 629/1016 [18:15<10:26, 1.62s/it] 62% 630/1016 [18:16<10:24, 1.62s/it] 62% 631/1016 [18:18<10:25, 1.62s/it] 62% 632/1016 [18:20<10:29, 1.64s/it] 62% 633/1016 [18:21<10:41, 1.68s/it] 62% 634/1016 [18:23<10:49, 1.70s/it] 62% 635/1016 [18:25<10:55, 1.72s/it] 63% 636/1016 [18:27<11:46, 1.86s/it] 63% 637/1016 [18:29<11:18, 1.79s/it] 63% 638/1016 [18:30<11:03, 1.75s/it] 63% 639/1016 [18:32<10:49, 1.72s/it] 63% 640/1016 [18:34<10:34, 1.69s/it] 63% 641/1016 [18:35<10:25, 1.67s/it] 63% 642/1016 [18:37<10:18, 1.65s/it] 63% 643/1016 [18:39<10:13, 1.64s/it] 63% 644/1016 [18:40<10:07, 1.63s/it] 63% 645/1016 [18:42<10:04, 1.63s/it] 64% 646/1016 [18:43<10:03, 1.63s/it] 64% 647/1016 [18:45<09:57, 1.62s/it] 64% 648/1016 [18:47<09:54, 1.61s/it] 64% 649/1016 [18:48<09:50, 1.61s/it] 64% 650/1016 [18:50<09:47, 1.60s/it] {'loss': 4.3688, 'grad_norm': 18.944068908691406, 'learning_rate': 5.879449395213175e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1526.23, 'total_tokens': 1681459, 'epoch': 1.92}
64% 650/1016 [18:50<09:47, 1.60s/it] 64% 651/1016 [18:51<09:45, 1.60s/it] 64% 652/1016 [18:53<09:47, 1.61s/it] 64% 653/1016 [18:55<09:46, 1.62s/it] 64% 654/1016 [18:56<09:44, 1.61s/it] 64% 655/1016 [18:58<09:42, 1.61s/it] 65% 656/1016 [19:00<09:39, 1.61s/it] 65% 657/1016 [19:01<09:36, 1.61s/it] 65% 658/1016 [19:03<09:33, 1.60s/it] 65% 659/1016 [19:04<09:31, 1.60s/it] 65% 660/1016 [19:06<09:33, 1.61s/it] 65% 661/1016 [19:08<09:35, 1.62s/it] 65% 662/1016 [19:10<10:26, 1.77s/it] 65% 663/1016 [19:11<10:06, 1.72s/it] 65% 664/1016 [19:13<09:53, 1.69s/it] 65% 665/1016 [19:14<09:42, 1.66s/it] 66% 666/1016 [19:16<09:34, 1.64s/it] 66% 667/1016 [19:18<09:29, 1.63s/it] 66% 668/1016 [19:19<09:25, 1.63s/it] 66% 669/1016 [19:21<09:20, 1.62s/it] 66% 670/1016 [19:23<09:16, 1.61s/it] 66% 671/1016 [19:24<09:14, 1.61s/it] 66% 672/1016 [19:26<09:12, 1.60s/it] 66% 673/1016 [19:27<09:10, 1.60s/it] 66% 674/1016 [19:29<09:10, 1.61s/it] 66% 675/1016 [19:31<09:10, 1.61s/it] 67% 676/1016 [19:32<09:09, 1.62s/it] 67% 677/1016 [19:34<09:06, 1.61s/it] 67% 678/1016 [19:34<07:31, 1.33s/it][2025-12-16 10:15:42,865] [INFO] [axolotl.core.trainers.base.evaluate:387] [PID:25550] Running evaluation step...
0% 0/143 [00:00<?, ?it/s]
2% 3/143 [00:00<00:06, 20.20it/s]
4% 6/143 [00:00<00:08, 15.95it/s]
6% 8/143 [00:00<00:08, 16.71it/s]
8% 11/143 [00:00<00:07, 18.56it/s]
10% 14/143 [00:00<00:06, 19.89it/s]
12% 17/143 [00:00<00:06, 20.49it/s]
14% 20/143 [00:01<00:05, 21.05it/s]
16% 23/143 [00:01<00:05, 21.41it/s]
18% 26/143 [00:01<00:05, 21.56it/s]
20% 29/143 [00:01<00:05, 21.70it/s]
22% 32/143 [00:01<00:05, 21.86it/s]
24% 35/143 [00:01<00:04, 21.88it/s]
27% 38/143 [00:01<00:04, 21.96it/s]
29% 41/143 [00:01<00:04, 21.90it/s]
31% 44/143 [00:02<00:04, 22.06it/s]
33% 47/143 [00:02<00:04, 22.14it/s]
35% 50/143 [00:02<00:04, 22.09it/s]
37% 53/143 [00:02<00:04, 22.12it/s]
39% 56/143 [00:02<00:03, 22.17it/s]
41% 59/143 [00:02<00:03, 22.03it/s]
43% 62/143 [00:02<00:03, 22.04it/s]
45% 65/143 [00:03<00:03, 21.87it/s]
48% 68/143 [00:03<00:03, 22.00it/s]
50% 71/143 [00:03<00:03, 22.04it/s]
52% 74/143 [00:03<00:03, 21.98it/s]
54% 77/143 [00:03<00:02, 22.03it/s]
56% 80/143 [00:03<00:02, 22.04it/s]
58% 83/143 [00:03<00:02, 21.97it/s]
60% 86/143 [00:04<00:02, 22.03it/s]
62% 89/143 [00:04<00:02, 21.95it/s]
64% 92/143 [00:04<00:02, 22.05it/s]
66% 95/143 [00:04<00:02, 22.10it/s]
69% 98/143 [00:04<00:02, 22.02it/s]
71% 101/143 [00:04<00:01, 22.12it/s]
73% 104/143 [00:04<00:01, 22.18it/s]
75% 107/143 [00:04<00:01, 22.07it/s]
77% 110/143 [00:05<00:01, 22.16it/s]
79% 113/143 [00:05<00:01, 22.02it/s]
81% 116/143 [00:05<00:01, 22.12it/s]
83% 119/143 [00:05<00:01, 22.17it/s]
85% 122/143 [00:05<00:00, 22.04it/s]
87% 125/143 [00:05<00:00, 22.11it/s]
90% 128/143 [00:05<00:00, 22.20it/s]
92% 131/143 [00:06<00:00, 22.07it/s]
94% 134/143 [00:06<00:00, 22.12it/s]
96% 137/143 [00:06<00:00, 22.01it/s]
98% 140/143 [00:06<00:00, 22.06it/s]
100% 143/143 [00:06<00:00, 19.78it/s]
{'eval_loss': 4.463651657104492, 'eval_runtime': 6.8792, 'eval_samples_per_second': 20.787, 'eval_steps_per_second': 20.787, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'epoch': 2.0}
67% 678/1016 [19:41<07:31, 1.33s/it]
100% 143/143 [00:06<00:00, 19.78it/s]
 67% 679/1016 [19:44<21:05, 3.75s/it] 67% 680/1016 [19:45<17:24, 3.11s/it] 67% 681/1016 [19:47<14:50, 2.66s/it] 67% 682/1016 [19:49<13:13, 2.37s/it] 67% 683/1016 [19:51<13:24, 2.42s/it] 67% 684/1016 [19:53<12:18, 2.22s/it] 67% 685/1016 [19:55<11:33, 2.10s/it] 68% 686/1016 [19:57<11:02, 2.01s/it] 68% 687/1016 [19:58<10:35, 1.93s/it] 68% 688/1016 [20:00<10:16, 1.88s/it] 68% 689/1016 [20:02<10:03, 1.85s/it] 68% 690/1016 [20:04<09:53, 1.82s/it] 68% 691/1016 [20:05<09:45, 1.80s/it] 68% 692/1016 [20:07<09:45, 1.81s/it] 68% 693/1016 [20:09<09:41, 1.80s/it] 68% 694/1016 [20:11<09:35, 1.79s/it] 68% 695/1016 [20:13<09:30, 1.78s/it] 69% 696/1016 [20:14<09:27, 1.77s/it] 69% 697/1016 [20:16<09:24, 1.77s/it] 69% 698/1016 [20:18<09:24, 1.77s/it] 69% 699/1016 [20:20<09:24, 1.78s/it] 69% 700/1016 [20:22<10:05, 1.92s/it] {'loss': 4.294, 'grad_norm': 18.91043472290039, 'learning_rate': 4.512643260086751e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 492.11, 'total_tokens': 1847089, 'epoch': 2.07}
69% 700/1016 [20:22<10:05, 1.92s/it] 69% 701/1016 [20:24<09:36, 1.83s/it] 69% 702/1016 [20:25<09:28, 1.81s/it] 69% 703/1016 [20:27<09:22, 1.80s/it] 69% 704/1016 [20:29<09:19, 1.79s/it] 69% 705/1016 [20:31<09:08, 1.76s/it] 69% 706/1016 [20:32<08:59, 1.74s/it] 70% 707/1016 [20:34<08:46, 1.70s/it] 70% 708/1016 [20:36<08:39, 1.69s/it] 70% 709/1016 [20:37<08:33, 1.67s/it] 70% 710/1016 [20:39<08:26, 1.65s/it] 70% 711/1016 [20:40<08:20, 1.64s/it] 70% 712/1016 [20:42<08:16, 1.63s/it] 70% 713/1016 [20:44<08:15, 1.64s/it] 70% 714/1016 [20:45<08:13, 1.63s/it] 70% 715/1016 [20:47<08:10, 1.63s/it] 70% 716/1016 [20:48<08:06, 1.62s/it] 71% 717/1016 [20:51<08:47, 1.76s/it] 71% 718/1016 [20:52<08:30, 1.71s/it] 71% 719/1016 [20:54<08:19, 1.68s/it] 71% 720/1016 [20:55<08:13, 1.67s/it] 71% 721/1016 [20:57<08:07, 1.65s/it] 71% 722/1016 [20:59<08:02, 1.64s/it] 71% 723/1016 [21:00<07:58, 1.63s/it] 71% 724/1016 [21:02<07:55, 1.63s/it] 71% 725/1016 [21:04<07:53, 1.63s/it] 71% 726/1016 [21:05<07:49, 1.62s/it] 72% 727/1016 [21:07<07:47, 1.62s/it] 72% 728/1016 [21:08<07:46, 1.62s/it] 72% 729/1016 [21:10<07:43, 1.61s/it] 72% 730/1016 [21:12<07:41, 1.61s/it] 72% 731/1016 [21:13<07:38, 1.61s/it] 72% 732/1016 [21:15<07:35, 1.60s/it] 72% 733/1016 [21:16<07:34, 1.60s/it] 72% 734/1016 [21:18<07:32, 1.60s/it] 72% 735/1016 [21:20<07:31, 1.61s/it] 72% 736/1016 [21:21<07:30, 1.61s/it] 73% 737/1016 [21:23<07:27, 1.61s/it] 73% 738/1016 [21:24<07:25, 1.60s/it] 73% 739/1016 [21:26<07:24, 1.61s/it] 73% 740/1016 [21:28<07:22, 1.60s/it] 73% 741/1016 [21:29<07:21, 1.60s/it] 73% 742/1016 [21:31<07:20, 1.61s/it] 73% 743/1016 [21:33<08:05, 1.78s/it] 73% 744/1016 [21:35<07:51, 1.73s/it] 73% 745/1016 [21:36<07:40, 1.70s/it] 73% 746/1016 [21:38<07:32, 1.68s/it] 74% 747/1016 [21:39<07:24, 1.65s/it] 74% 748/1016 [21:41<07:19, 1.64s/it] 74% 749/1016 [21:43<07:17, 1.64s/it] 74% 750/1016 [21:44<07:15, 1.64s/it] {'loss': 4.2329, 'grad_norm': 22.407855987548828, 'learning_rate': 3.279350399124066e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1508.42, 'total_tokens': 1969141, 'epoch': 2.21}
74% 750/1016 [21:44<07:15, 1.64s/it] 74% 751/1016 [21:46<07:13, 1.64s/it] 74% 752/1016 [21:48<07:09, 1.63s/it] 74% 753/1016 [21:49<07:07, 1.62s/it] 74% 754/1016 [21:51<07:07, 1.63s/it] 74% 755/1016 [21:53<07:14, 1.67s/it] 74% 756/1016 [21:54<07:20, 1.69s/it] 75% 757/1016 [21:56<07:27, 1.73s/it] 75% 758/1016 [21:58<07:22, 1.71s/it] 75% 759/1016 [21:59<07:14, 1.69s/it] 75% 760/1016 [22:01<07:05, 1.66s/it] 75% 761/1016 [22:03<07:03, 1.66s/it] 75% 762/1016 [22:04<06:58, 1.65s/it] 75% 763/1016 [22:06<06:53, 1.63s/it] 75% 764/1016 [22:08<07:28, 1.78s/it] 75% 765/1016 [22:10<07:15, 1.73s/it] 75% 766/1016 [22:11<07:04, 1.70s/it] 75% 767/1016 [22:13<06:56, 1.67s/it] 76% 768/1016 [22:15<06:50, 1.65s/it] 76% 769/1016 [22:16<06:44, 1.64s/it] 76% 770/1016 [22:18<06:40, 1.63s/it] 76% 771/1016 [22:19<06:38, 1.63s/it] 76% 772/1016 [22:21<06:36, 1.63s/it] 76% 773/1016 [22:23<06:33, 1.62s/it] 76% 774/1016 [22:24<06:32, 1.62s/it] 76% 775/1016 [22:26<06:29, 1.61s/it] 76% 776/1016 [22:27<06:27, 1.62s/it] 76% 777/1016 [22:29<06:24, 1.61s/it] 77% 778/1016 [22:31<06:22, 1.61s/it] 77% 779/1016 [22:32<06:22, 1.61s/it] 77% 780/1016 [22:34<06:21, 1.62s/it] 77% 781/1016 [22:36<06:53, 1.76s/it] 77% 782/1016 [22:38<06:40, 1.71s/it] 77% 783/1016 [22:39<06:30, 1.67s/it] 77% 784/1016 [22:41<06:22, 1.65s/it] 77% 785/1016 [22:42<06:16, 1.63s/it] 77% 786/1016 [22:44<06:12, 1.62s/it] 77% 787/1016 [22:46<06:10, 1.62s/it] 78% 788/1016 [22:47<06:07, 1.61s/it] 78% 789/1016 [22:49<06:05, 1.61s/it] 78% 790/1016 [22:50<06:03, 1.61s/it] 78% 791/1016 [22:52<06:00, 1.60s/it] 78% 792/1016 [22:54<05:58, 1.60s/it] 78% 793/1016 [22:55<05:57, 1.60s/it] 78% 794/1016 [22:57<05:56, 1.61s/it] 78% 795/1016 [22:58<05:54, 1.60s/it] 78% 796/1016 [23:00<05:52, 1.60s/it] 78% 797/1016 [23:02<05:50, 1.60s/it] 79% 798/1016 [23:03<05:47, 1.59s/it] 79% 799/1016 [23:05<05:47, 1.60s/it] 79% 800/1016 [23:06<05:45, 1.60s/it] {'loss': 4.2312, 'grad_norm': 22.668535232543945, 'learning_rate': 2.209578150224645e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1531.3, 'total_tokens': 2090972, 'epoch': 2.36}
79% 800/1016 [23:06<05:45, 1.60s/it][2025-12-16 10:19:14,749] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-800
79% 801/1016 [23:20<19:07, 5.34s/it] 79% 802/1016 [23:22<15:14, 4.27s/it] 79% 803/1016 [23:24<12:28, 3.51s/it] 79% 804/1016 [23:26<10:35, 3.00s/it] 79% 805/1016 [23:27<09:13, 2.62s/it] 79% 806/1016 [23:29<08:16, 2.36s/it] 79% 807/1016 [23:32<08:10, 2.35s/it] 80% 808/1016 [23:33<07:32, 2.18s/it] 80% 809/1016 [23:35<07:05, 2.05s/it] 80% 810/1016 [23:37<06:45, 1.97s/it] 80% 811/1016 [23:39<06:30, 1.91s/it] 80% 812/1016 [23:40<06:19, 1.86s/it] 80% 813/1016 [23:42<06:11, 1.83s/it] 80% 814/1016 [23:44<06:07, 1.82s/it] 80% 815/1016 [23:46<06:03, 1.81s/it] 80% 816/1016 [23:47<05:58, 1.79s/it] 80% 817/1016 [23:49<05:53, 1.78s/it] 81% 818/1016 [23:51<05:49, 1.76s/it] 81% 819/1016 [23:53<05:41, 1.73s/it] 81% 820/1016 [23:54<05:31, 1.69s/it] 81% 821/1016 [23:56<05:32, 1.71s/it] 81% 822/1016 [23:58<05:34, 1.73s/it] 81% 823/1016 [23:59<05:34, 1.73s/it] 81% 824/1016 [24:01<05:31, 1.73s/it] 81% 825/1016 [24:03<05:25, 1.70s/it] 81% 826/1016 [24:04<05:17, 1.67s/it] 81% 827/1016 [24:06<05:14, 1.67s/it] 81% 828/1016 [24:08<05:40, 1.81s/it] 82% 829/1016 [24:10<05:27, 1.75s/it] 82% 830/1016 [24:11<05:17, 1.71s/it] 82% 831/1016 [24:13<05:11, 1.68s/it] 82% 832/1016 [24:15<05:05, 1.66s/it] 82% 833/1016 [24:16<05:00, 1.64s/it] 82% 834/1016 [24:18<04:55, 1.63s/it] 82% 835/1016 [24:19<04:52, 1.62s/it] 82% 836/1016 [24:21<04:51, 1.62s/it] 82% 837/1016 [24:23<04:50, 1.62s/it] 82% 838/1016 [24:24<04:47, 1.61s/it] 83% 839/1016 [24:26<04:45, 1.61s/it] 83% 840/1016 [24:28<04:43, 1.61s/it] 83% 841/1016 [24:29<04:41, 1.61s/it] 83% 842/1016 [24:31<04:40, 1.61s/it] 83% 843/1016 [24:32<04:40, 1.62s/it] 83% 844/1016 [24:34<04:38, 1.62s/it] 83% 845/1016 [24:36<05:00, 1.76s/it] 83% 846/1016 [24:38<04:51, 1.71s/it] 83% 847/1016 [24:39<04:44, 1.68s/it] 83% 848/1016 [24:41<04:39, 1.66s/it] 84% 849/1016 [24:42<04:33, 1.64s/it] 84% 850/1016 [24:44<04:29, 1.62s/it] {'loss': 4.222, 'grad_norm': 18.202590942382812, 'learning_rate': 1.3293552194358238e-05, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1544.96, 'total_tokens': 2213097, 'epoch': 2.51}
84% 850/1016 [24:44<04:29, 1.62s/it] 84% 851/1016 [24:46<04:28, 1.62s/it] 84% 852/1016 [24:47<04:25, 1.62s/it] 84% 853/1016 [24:49<04:23, 1.62s/it] 84% 854/1016 [24:51<04:22, 1.62s/it] 84% 855/1016 [24:52<04:19, 1.61s/it] 84% 856/1016 [24:54<04:17, 1.61s/it] 84% 857/1016 [24:55<04:15, 1.61s/it] 84% 858/1016 [24:57<04:13, 1.61s/it] 85% 859/1016 [24:59<04:13, 1.62s/it] 85% 860/1016 [25:00<04:10, 1.61s/it] 85% 861/1016 [25:02<04:09, 1.61s/it] 85% 862/1016 [25:03<04:07, 1.61s/it] 85% 863/1016 [25:05<04:04, 1.60s/it] 85% 864/1016 [25:07<04:03, 1.60s/it] 85% 865/1016 [25:08<04:01, 1.60s/it] 85% 866/1016 [25:10<04:01, 1.61s/it] 85% 867/1016 [25:11<04:00, 1.61s/it] 85% 868/1016 [25:13<03:58, 1.61s/it] 86% 869/1016 [25:15<03:57, 1.61s/it] 86% 870/1016 [25:16<03:55, 1.61s/it] 86% 871/1016 [25:18<04:14, 1.76s/it] 86% 872/1016 [25:20<04:06, 1.71s/it] 86% 873/1016 [25:22<04:02, 1.70s/it] 86% 874/1016 [25:23<04:00, 1.69s/it] 86% 875/1016 [25:25<04:00, 1.71s/it] 86% 876/1016 [25:27<04:01, 1.72s/it] 86% 877/1016 [25:29<04:00, 1.73s/it] 86% 878/1016 [25:30<03:54, 1.70s/it] 87% 879/1016 [25:32<03:50, 1.68s/it] 87% 880/1016 [25:33<03:47, 1.67s/it] 87% 881/1016 [25:35<03:45, 1.67s/it] 87% 882/1016 [25:37<03:41, 1.65s/it] 87% 883/1016 [25:38<03:37, 1.64s/it] 87% 884/1016 [25:40<03:35, 1.63s/it] 87% 885/1016 [25:42<03:33, 1.63s/it] 87% 886/1016 [25:43<03:30, 1.62s/it] 87% 887/1016 [25:45<03:29, 1.63s/it] 87% 888/1016 [25:46<03:28, 1.63s/it] 88% 889/1016 [25:48<03:25, 1.62s/it] 88% 890/1016 [25:50<03:23, 1.62s/it] 88% 891/1016 [25:51<03:21, 1.61s/it] 88% 892/1016 [25:53<03:38, 1.76s/it] 88% 893/1016 [25:55<03:30, 1.71s/it] 88% 894/1016 [25:57<03:25, 1.68s/it] 88% 895/1016 [25:58<03:21, 1.67s/it] 88% 896/1016 [26:00<03:18, 1.65s/it] 88% 897/1016 [26:01<03:15, 1.64s/it] 88% 898/1016 [26:03<03:12, 1.63s/it] 88% 899/1016 [26:05<03:10, 1.63s/it] 89% 900/1016 [26:06<03:08, 1.62s/it] {'loss': 4.2271, 'grad_norm': 18.34627914428711, 'learning_rate': 6.600983746212319e-06, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1515.29, 'total_tokens': 2335026, 'epoch': 2.66}
89% 900/1016 [26:06<03:08, 1.62s/it] 89% 901/1016 [26:08<03:06, 1.62s/it] 89% 902/1016 [26:10<03:04, 1.62s/it] 89% 903/1016 [26:11<03:03, 1.62s/it] 89% 904/1016 [26:13<03:01, 1.62s/it] 89% 905/1016 [26:14<02:59, 1.62s/it] 89% 906/1016 [26:16<02:58, 1.62s/it] 89% 907/1016 [26:18<02:56, 1.62s/it] 89% 908/1016 [26:19<02:54, 1.61s/it] 89% 909/1016 [26:21<02:52, 1.61s/it] 90% 910/1016 [26:23<03:07, 1.77s/it] 90% 911/1016 [26:25<03:01, 1.73s/it] 90% 912/1016 [26:26<02:55, 1.69s/it] 90% 913/1016 [26:28<02:51, 1.67s/it] 90% 914/1016 [26:29<02:48, 1.65s/it] 90% 915/1016 [26:31<02:46, 1.65s/it] 90% 916/1016 [26:33<02:44, 1.64s/it] 90% 917/1016 [26:34<02:43, 1.65s/it] 90% 918/1016 [26:36<02:40, 1.64s/it] 90% 919/1016 [26:38<02:37, 1.62s/it] 91% 920/1016 [26:39<02:35, 1.62s/it] 91% 921/1016 [26:41<02:33, 1.61s/it] 91% 922/1016 [26:42<02:31, 1.61s/it] 91% 923/1016 [26:44<02:29, 1.61s/it] 91% 924/1016 [26:46<02:28, 1.61s/it] 91% 925/1016 [26:47<02:27, 1.63s/it] 91% 926/1016 [26:49<02:26, 1.62s/it] 91% 927/1016 [26:51<02:24, 1.62s/it] 91% 928/1016 [26:53<02:40, 1.83s/it] 91% 929/1016 [26:55<02:36, 1.80s/it] 92% 930/1016 [26:56<02:34, 1.79s/it] 92% 931/1016 [26:58<02:32, 1.80s/it] 92% 932/1016 [27:00<02:30, 1.79s/it] 92% 933/1016 [27:02<02:27, 1.78s/it] 92% 934/1016 [27:03<02:25, 1.78s/it] 92% 935/1016 [27:05<02:23, 1.77s/it] 92% 936/1016 [27:07<02:21, 1.77s/it] 92% 937/1016 [27:09<02:19, 1.76s/it] 92% 938/1016 [27:11<02:19, 1.78s/it] 92% 939/1016 [27:12<02:17, 1.78s/it] 93% 940/1016 [27:14<02:15, 1.78s/it] 93% 941/1016 [27:16<02:12, 1.77s/it] 93% 942/1016 [27:18<02:10, 1.77s/it] 93% 943/1016 [27:19<02:08, 1.76s/it] 93% 944/1016 [27:21<02:06, 1.76s/it] 93% 945/1016 [27:23<02:03, 1.75s/it] 93% 946/1016 [27:24<01:59, 1.71s/it] 93% 947/1016 [27:26<01:58, 1.71s/it] 93% 948/1016 [27:28<01:56, 1.72s/it] 93% 949/1016 [27:30<01:55, 1.73s/it] 94% 950/1016 [27:31<01:53, 1.72s/it] {'loss': 4.2248, 'grad_norm': 21.71125030517578, 'learning_rate': 2.1809135253115565e-06, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1450.1, 'total_tokens': 2458575, 'epoch': 2.8}
94% 950/1016 [27:31<01:53, 1.72s/it] 94% 951/1016 [27:33<01:50, 1.70s/it] 94% 952/1016 [27:35<01:47, 1.67s/it] 94% 953/1016 [27:36<01:44, 1.66s/it] 94% 954/1016 [27:38<01:51, 1.81s/it] 94% 955/1016 [27:40<01:46, 1.75s/it] 94% 956/1016 [27:42<01:42, 1.70s/it] 94% 957/1016 [27:43<01:38, 1.67s/it] 94% 958/1016 [27:45<01:36, 1.66s/it] 94% 959/1016 [27:46<01:33, 1.65s/it] 94% 960/1016 [27:48<01:31, 1.64s/it] 95% 961/1016 [27:50<01:29, 1.63s/it] 95% 962/1016 [27:51<01:27, 1.62s/it] 95% 963/1016 [27:53<01:25, 1.61s/it] 95% 964/1016 [27:54<01:23, 1.61s/it] 95% 965/1016 [27:56<01:21, 1.61s/it] 95% 966/1016 [27:58<01:20, 1.61s/it] 95% 967/1016 [27:59<01:19, 1.62s/it] 95% 968/1016 [28:01<01:17, 1.62s/it] 95% 969/1016 [28:03<01:15, 1.61s/it] 95% 970/1016 [28:04<01:14, 1.62s/it] 96% 971/1016 [28:06<01:12, 1.61s/it] 96% 972/1016 [28:07<01:10, 1.61s/it] 96% 973/1016 [28:09<01:09, 1.61s/it] 96% 974/1016 [28:11<01:14, 1.77s/it] 96% 975/1016 [28:13<01:10, 1.73s/it] 96% 976/1016 [28:14<01:07, 1.69s/it] 96% 977/1016 [28:16<01:05, 1.67s/it] 96% 978/1016 [28:18<01:02, 1.65s/it] 96% 979/1016 [28:19<01:00, 1.64s/it] 96% 980/1016 [28:21<00:58, 1.63s/it] 97% 981/1016 [28:22<00:56, 1.63s/it] 97% 982/1016 [28:24<00:55, 1.63s/it] 97% 983/1016 [28:26<00:53, 1.62s/it] 97% 984/1016 [28:27<00:51, 1.62s/it] 97% 985/1016 [28:29<00:49, 1.61s/it] 97% 986/1016 [28:30<00:48, 1.60s/it] 97% 987/1016 [28:32<00:46, 1.60s/it] 97% 988/1016 [28:34<00:44, 1.60s/it] 97% 989/1016 [28:35<00:43, 1.61s/it] 97% 990/1016 [28:37<00:41, 1.60s/it] 98% 991/1016 [28:38<00:39, 1.60s/it] 98% 992/1016 [28:41<00:41, 1.74s/it] 98% 993/1016 [28:42<00:39, 1.70s/it] 98% 994/1016 [28:44<00:36, 1.67s/it] 98% 995/1016 [28:45<00:34, 1.65s/it] 98% 996/1016 [28:47<00:32, 1.65s/it] 98% 997/1016 [28:49<00:31, 1.64s/it] 98% 998/1016 [28:50<00:29, 1.63s/it] 98% 999/1016 [28:52<00:27, 1.62s/it] 98% 1000/1016 [28:53<00:26, 1.64s/it] {'loss': 4.2186, 'grad_norm': 16.12077522277832, 'learning_rate': 1.4088658024622448e-07, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'tokens_per_second_per_gpu': 1481.33, 'total_tokens': 2582586, 'epoch': 2.95}
98% 1000/1016 [28:53<00:26, 1.64s/it][2025-12-16 10:25:01,884] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-1000
99% 1001/1016 [29:04<01:05, 4.39s/it] 99% 1002/1016 [29:06<00:49, 3.57s/it] 99% 1003/1016 [29:08<00:38, 2.98s/it] 99% 1004/1016 [29:09<00:30, 2.56s/it] 99% 1005/1016 [29:11<00:25, 2.28s/it] 99% 1006/1016 [29:12<00:20, 2.09s/it] 99% 1007/1016 [29:14<00:17, 1.94s/it] 99% 1008/1016 [29:16<00:14, 1.84s/it] 99% 1009/1016 [29:17<00:12, 1.76s/it] 99% 1010/1016 [29:19<00:10, 1.72s/it] 100% 1011/1016 [29:20<00:08, 1.68s/it] 100% 1012/1016 [29:22<00:06, 1.66s/it] 100% 1013/1016 [29:24<00:04, 1.66s/it] 100% 1014/1016 [29:25<00:03, 1.64s/it] 100% 1015/1016 [29:27<00:01, 1.62s/it] 100% 1016/1016 [29:28<00:00, 1.61s/it][2025-12-16 10:25:36,817] [INFO] [axolotl.core.trainers.base._save:676] [PID:25550] Saving model checkpoint to /content/finetuned_model/checkpoint-1016
{'train_runtime': 1778.0394, 'train_samples_per_second': 4.571, 'train_steps_per_second': 0.571, 'train_loss': 4.676482853927012, 'memory/max_active (GiB)': 7.2, 'memory/max_allocated (GiB)': 7.2, 'memory/device_reserved (GiB)': 7.62, 'epoch': 3.0}
100% 1016/1016 [29:38<00:00, 1.61s/it] 100% 1016/1016 [29:38<00:00, 1.75s/it]
[2025-12-16 10:25:46,033] [INFO] [axolotl.train.save_trained_model:233] [PID:25550] Training completed! Saving trained model to /content/finetuned_model.
[2025-12-16 10:25:49,446] [INFO] [axolotl.train.save_trained_model:351] [PID:25550] Model successfully saved to /content/finetuned_model