Upload folder using huggingface_hub

97e1511 verified about 2 months ago

140 kB

	[2025-12-27 21:18:07,941] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:122677] bf16 support detected, enabling for this configuration.
	config.json: 0.00B [00:00, ?B/s] config.json: 1.54kB [00:00, 6.02MB/s]
	[2025-12-27 21:18:08,103] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:122677] baseline 0.000GB ()
	[2025-12-27 21:18:08,106] [INFO] [axolotl.cli.config.load_cfg:248] [PID:122677] config:
	{
	"activation_offloading": false,
	"adapter": "lora",
	"axolotl_config_path": "config.yaml",
	"base_model": "BKM1804/affine-he-CIVICbeatPORSCHE",
	"base_model_config": "BKM1804/affine-he-CIVICbeatPORSCHE",
	"batch_size": 128,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_90",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"context_parallel_size": 1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_processes": 18,
	"datasets": [
	{
	"chat_template": "tokenizer_default",
	"field_messages": "messages",
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "/workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl",
	"split": "train",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.7.1"
	},
	"eval_batch_size": 2,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"fp16": false,
	"gradient_accumulation_steps": 64,
	"gradient_checkpointing": false,
	"include_tkps": true,
	"learning_rate": 2e-06,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": true,
	"local_rank": 0,
	"lora_alpha": 64,
	"lora_dropout": 0.05,
	"lora_r": 32,
	"lora_target_modules": [
	"q_proj",
	"v_proj",
	"k_proj",
	"o_proj",
	"gate_proj",
	"down_proj",
	"up_proj"
	],
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "cosine",
	"mean_resizing_embeddings": false,
	"micro_batch_size": 2,
	"model_config_type": "qwen3",
	"num_epochs": 3.0,
	"optimizer": "adamw_bnb_8bit",
	"output_dir": "./outputs/mymodel",
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": false,
	"save_safetensors": true,
	"sequence_len": 4096,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "BKM1804/affine-he-CIVICbeatPORSCHE",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"use_ray": false,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"weight_decay": 0.0,
	"world_size": 1
	}
	tokenizer_config.json: 0.00B [00:00, ?B/s] tokenizer_config.json: 5.40kB [00:00, 22.8MB/s]
	vocab.json: 0.00B [00:00, ?B/s] vocab.json: 32.8kB [00:00, 297kB/s] vocab.json: 1.66MB [00:00, 7.30MB/s] vocab.json: 2.78MB [00:00, 10.2MB/s]
	merges.txt: 0.00B [00:00, ?B/s] merges.txt: 43.4kB [00:00, 357kB/s] merges.txt: 1.67MB [00:00, 8.66MB/s]
	tokenizer.json: 0%\| \| 0.00/11.4M [00:00<?, ?B/s] tokenizer.json: 3%\|███▎ \| 329k/11.4M [00:00<00:16, 682kB/s] tokenizer.json: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 11.4M/11.4M [00:00<00:00, 25.0MB/s] tokenizer.json: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 11.4M/11.4M [00:00<00:00, 19.1MB/s]
	added_tokens.json: 0%\| \| 0.00/707 [00:00<?, ?B/s] added_tokens.json: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 707/707 [00:00<00:00, 5.99MB/s]
	special_tokens_map.json: 0%\| \| 0.00/613 [00:00<?, ?B/s] special_tokens_map.json: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████\| 613/613 [00:00<00:00, 4.94MB/s]
	chat_template.jinja: 0.00B [00:00, ?B/s] chat_template.jinja: 4.93kB [00:00, 23.1MB/s]
	[2025-12-27 21:18:10,826] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <\|im_end\|>
	[2025-12-27 21:18:10,827] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None
	[2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <\|endoftext\|>
	[2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None
	[2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:122677] Unable to find prepared dataset in last_run_prepared/f6b60198703671e2d2150636511428c1
	[2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:122677] Loading raw datasets...
	[2025-12-27 21:18:10,829] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:122677] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
	[2025-12-27 21:18:10,933] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:122677] Loading dataset: /workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl with base_type: chat_template and prompt_style: None
	[2025-12-27 21:18:10,935] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:122677] Using chat template:
	---
	{%- set ns = namespace(last_query_index=-1) %}
	{%- for message in messages %}
	{%- if message.role == "user" %}
	{%- set ns.last_query_index = loop.index0 %}
	{%- endif %}
	{%- endfor %}
	{%- if tools %}
	{{- '<\|im_start\|>system\n' }}
	{%- if messages[0].role == 'system' %}
	{{- messages[0].content + '\n\n' }}
	{%- endif %}
	{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
	{%- for tool in tools %}
	{{- "\n" }}
	{{- tool \| tojson }}
	{%- endfor %}
	{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><\|im_end\|>\n" }}
	{%- else %}
	{%- if messages[0].role == 'system' %}
	{{- '<\|im_start\|>system\n' + messages[0].content + '<\|im_end\|>\n' }}
	{%- endif %}
	{%- endif %}
	{%- for message in messages %}
	{%- if message.content is string %}
	{%- set content = message.content %}
	{%- else %}
	{%- set content = '' %}
	{%- endif %}
	{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
	{{- '<\|im_start\|>' + message.role + '\n' + content + '<\|im_end\|>' + '\n' }}
	{%- elif message.role == "assistant" %}
	{%- set has_loss = (message.loss is defined and message.loss) %}
	{%- set reasoning_content = '' %}
	{%- if message.reasoning_content is string %}
	{%- set reasoning_content = message.reasoning_content %}
	{%- else %}
	{%- if '</think>' in content %}
	{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
	{%- set content = content.split('</think>')[-1].lstrip('\n') %}
	{%- endif %}
	{%- endif %}
	{{- '<\|im_start\|>' + message.role + '\n' }}
	{%- if has_loss -%}
	{%- generation -%}
	{%- if loop.index0 > ns.last_query_index and reasoning_content %}
	{{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
	{%- else %}
	{{- content }}
	{%- endif %}
	{%- if message.tool_calls %}
	{%- for tool_call in message.tool_calls %}
	{%- if (loop.first and content) or (not loop.first) %}
	{{- '\n' }}
	{%- endif %}
	{%- if tool_call.function %}
	{%- set tool_call = tool_call.function %}
	{%- endif %}
	{{- '<tool_call>\n{"name": "' }}
	{{- tool_call.name }}
	{{- '", "arguments": ' }}
	{%- if tool_call.arguments is string %}
	{{- tool_call.arguments }}
	{%- else %}
	{{- tool_call.arguments \| tojson }}
	{%- endif %}
	{{- '}\n</tool_call>' }}
	{%- endfor %}
	{%- endif %}
	{{- '<\|im_end\|>' }}
	{%- endgeneration -%}
	{{- '\n' }}
	{%- else -%}
	{%- if loop.index0 > ns.last_query_index and reasoning_content %}
	{{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
	{%- else %}
	{{- content }}
	{%- endif %}
	{%- if message.tool_calls %}
	{%- for tool_call in message.tool_calls %}
	{%- if (loop.first and content) or (not loop.first) %}
	{{- '\n' }}
	{%- endif %}
	{%- if tool_call.function %}
	{%- set tool_call = tool_call.function %}
	{%- endif %}
	{{- '<tool_call>\n{"name": "' }}
	{{- tool_call.name }}
	{{- '", "arguments": ' }}
	{%- if tool_call.arguments is string %}
	{{- tool_call.arguments }}
	{%- else %}
	{{- tool_call.arguments \| tojson }}
	{%- endif %}
	{{- '}\n</tool_call>' }}
	{%- endfor %}
	{%- endif %}
	{{- '<\|im_end\|>\n' }}
	{%- endif %}
	{%- elif message.role == "tool" %}
	{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
	{{- '<\|im_start\|>user' }}
	{%- endif %}
	{{- '\n<tool_response>\n' }}
	{{- content }}
	{{- '\n</tool_response>' }}
	{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
	{{- '<\|im_end\|>\n' }}
	{%- endif %}
	{%- endif %}
	{%- endfor %}
	{%- if add_generation_prompt %}
	{{- '<\|im_start\|>assistant\n' }}
	{%- endif %}
	---
	Tokenizing Prompts (num_proc=18): 0%\| \| 0/3494 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=18): 6%\|████▊ \| 195/3494 [00:01<00:18, 179.32 examples/s] Tokenizing Prompts (num_proc=18): 11%\|█████████▋ \| 390/3494 [00:01<00:08, 380.73 examples/s] Tokenizing Prompts (num_proc=18): 17%\|██████████████▌ \| 584/3494 [00:01<00:04, 583.45 examples/s] Tokenizing Prompts (num_proc=18): 28%\|███████████████████████▉ \| 972/3494 [00:01<00:02, 1112.10 examples/s] Tokenizing Prompts (num_proc=18): 39%\|█████████████████████████████████ \| 1360/3494 [00:01<00:01, 1564.38 examples/s] Tokenizing Prompts (num_proc=18): 50%\|██████████████████████████████████████████▌ \| 1748/3494 [00:01<00:00, 2022.08 examples/s] Tokenizing Prompts (num_proc=18): 61%\|███████████████████████████████████████████████████▉ \| 2136/3494 [00:01<00:00, 1942.13 examples/s] Tokenizing Prompts (num_proc=18): 89%\|███████████████████████████████████████████████████████████████████████████▌ \| 3106/3494 [00:02<00:00, 3193.48 examples/s] Tokenizing Prompts (num_proc=18): 100%\|█████████████████████████████████████████████████████████████████████████████████████\| 3494/3494 [00:02<00:00, 2646.33 examples/s] Tokenizing Prompts (num_proc=18): 100%\|█████████████████████████████████████████████████████████████████████████████████████\| 3494/3494 [00:02<00:00, 1446.93 examples/s]
	[2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:122677] min_input_len: 64
	[2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:122677] max_input_len: 24840
	Dropping Long Sequences (>4096) (num_proc=18): 0%\| \| 0/3494 [00:00<?, ? examples/s] Dropping Long Sequences (>4096) (num_proc=18): 6%\|████▏ \| 195/3494 [00:00<00:05, 564.88 examples/s] Dropping Long Sequences (>4096) (num_proc=18): 100%\|████████████████████████████████████████████████████████████████████████\| 3494/3494 [00:00<00:00, 6342.01 examples/s]
	[2025-12-27 21:18:14,100] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:122677] Dropped 49 samples from dataset
	Saving the dataset (0/13 shards): 0%\| \| 0/3445 [00:00<?, ? examples/s] Saving the dataset (0/13 shards): 8%\|██████▋ \| 265/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (1/13 shards): 8%\|██████▋ \| 265/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (2/13 shards): 15%\|█████████████▍ \| 530/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (3/13 shards): 23%\|████████████████████ \| 795/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (4/13 shards): 31%\|██████████████████████████▍ \| 1060/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (5/13 shards): 38%\|█████████████████████████████████ \| 1325/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (6/13 shards): 46%\|███████████████████████████████████████▋ \| 1590/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (7/13 shards): 54%\|██████████████████████████████████████████████▎ \| 1855/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (8/13 shards): 62%\|████████████████████████████████████████████████████▉ \| 2120/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (9/13 shards): 69%\|███████████████████████████████████████████████████████████▌ \| 2385/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (10/13 shards): 77%\|█████████████████████████████████████████████████████████████████▍ \| 2650/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (11/13 shards): 85%\|███████████████████████████████████████████████████████████████████████▉ \| 2915/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (12/13 shards): 92%\|██████████████████████████████████████████████████████████████████████████████▍ \| 3180/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (13/13 shards): 100%\|█████████████████████████████████████████████████████████████████████████████████████\| 3445/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (13/13 shards): 100%\|████████████████████████████████████████████████████████████████████████████████████\| 3445/3445 [00:00<00:00, 8496.02 examples/s]
	[2025-12-27 21:18:14,663] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:122677] total_num_tokens: 1_863_059
	[2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:122677] `total_supervised_tokens: 888_884`
	[2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:122677] total_num_steps: 81
	[2025-12-27 21:18:14,696] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:122677] Maximum number of steps set at 81
	[2025-12-27 21:18:14,722] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:122677] Loading tokenizer... BKM1804/affine-he-CIVICbeatPORSCHE
	[2025-12-27 21:18:15,206] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <\|im_end\|>
	[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None
	[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <\|endoftext\|>
	[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None
	[2025-12-27 21:18:15,208] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:122677] Loading model
	[2025-12-27 21:18:15,257] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:122677] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2025-12-27 21:18:15,258] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:122677] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	model.safetensors.index.json: 0.00B [00:00, ?B/s] model.safetensors.index.json: 32.9kB [00:00, 88.3MB/s]
	model-00001-of-00002.safetensors: 0%\| \| 0.00/4.97G [00:00<?, ?B/s] model-00001-of-00002.safetensors: 0%\| \| 680k/4.97G [00:00<2:00:53, 685kB/s] model-00001-of-00002.safetensors: 0%\| \| 2.84M/4.97G [00:01<39:57, 2.07MB/s] model-00001-of-00002.safetensors: 1%\|█▎ \| 70.5M/4.97G [00:01<01:22, 59.1MB/s] model-00001-of-00002.safetensors: 3%\|██▌ \| 138M/4.97G [00:02<00:49, 98.0MB/s] model-00001-of-00002.safetensors: 4%\|███▉ \| 205M/4.97G [00:02<00:33, 141MB/s] model-00001-of-00002.safetensors: 5%\|█████▏ \| 272M/4.97G [00:02<00:25, 185MB/s] model-00001-of-00002.safetensors: 7%\|██████▍ \| 339M/4.97G [00:02<00:20, 222MB/s] model-00001-of-00002.safetensors: 8%\|███████▊ \| 406M/4.97G [00:03<00:19, 235MB/s] model-00001-of-00002.safetensors: 10%\|█████████ \| 473M/4.97G [00:03<00:20, 225MB/s] model-00001-of-00002.safetensors: 11%\|██████████▎ \| 540M/4.97G [00:03<00:20, 215MB/s] model-00001-of-00002.safetensors: 12%\|███████████▌ \| 607M/4.97G [00:04<00:20, 210MB/s] model-00001-of-00002.safetensors: 14%\|████████████▉ \| 674M/4.97G [00:04<00:20, 209MB/s] model-00001-of-00002.safetensors: 15%\|██████████████▏ \| 741M/4.97G [00:04<00:17, 237MB/s] model-00001-of-00002.safetensors: 16%\|███████████████▍ \| 808M/4.97G [00:04<00:15, 262MB/s] model-00001-of-00002.safetensors: 18%\|████████████████▋ \| 875M/4.97G [00:04<00:14, 286MB/s] model-00001-of-00002.safetensors: 19%\|██████████████████ \| 942M/4.97G [00:05<00:13, 294MB/s] model-00001-of-00002.safetensors: 20%\|███████████████████ \| 1.01G/4.97G [00:05<00:15, 260MB/s] model-00001-of-00002.safetensors: 22%\|████████████████████▎ \| 1.08G/4.97G [00:05<00:16, 237MB/s] model-00001-of-00002.safetensors: 23%\|█████████████████████▋ \| 1.14G/4.97G [00:06<00:15, 251MB/s] model-00001-of-00002.safetensors: 24%\|██████████████████████▉ \| 1.21G/4.97G [00:06<00:14, 262MB/s] model-00001-of-00002.safetensors: 26%\|████████████████████████▏ \| 1.28G/4.97G [00:06<00:12, 289MB/s] model-00001-of-00002.safetensors: 27%\|█████████████████████████▍ \| 1.34G/4.97G [00:06<00:12, 283MB/s] model-00001-of-00002.safetensors: 28%\|██████████████████████████▋ \| 1.41G/4.97G [00:06<00:11, 298MB/s] model-00001-of-00002.safetensors: 30%\|███████████████████████████▉ \| 1.48G/4.97G [00:07<00:10, 334MB/s] model-00001-of-00002.safetensors: 31%\|█████████████████████████████▎ \| 1.55G/4.97G [00:07<00:10, 318MB/s] model-00001-of-00002.safetensors: 32%\|██████████████████████████████▌ \| 1.61G/4.97G [00:07<00:12, 265MB/s] model-00001-of-00002.safetensors: 34%\|███████████████████████████████▊ \| 1.68G/4.97G [00:07<00:11, 287MB/s] model-00001-of-00002.safetensors: 35%\|█████████████████████████████████ \| 1.75G/4.97G [00:08<00:11, 291MB/s] model-00001-of-00002.safetensors: 36%\|██████████████████████████████████▎ \| 1.81G/4.97G [00:08<00:10, 290MB/s] model-00001-of-00002.safetensors: 38%\|███████████████████████████████████▌ \| 1.88G/4.97G [00:08<00:10, 301MB/s] model-00001-of-00002.safetensors: 39%\|████████████████████████████████████▊ \| 1.95G/4.97G [00:08<00:09, 305MB/s] model-00001-of-00002.safetensors: 41%\|██████████████████████████████████████ \| 2.01G/4.97G [00:08<00:08, 333MB/s] model-00001-of-00002.safetensors: 42%\|███████████████████████████████████████▍ \| 2.08G/4.97G [00:09<00:08, 346MB/s] model-00001-of-00002.safetensors: 43%\|████████████████████████████████████████▋ \| 2.15G/4.97G [00:09<00:07, 353MB/s] model-00001-of-00002.safetensors: 45%\|█████████████████████████████████████████▉ \| 2.22G/4.97G [00:09<00:07, 356MB/s] model-00001-of-00002.safetensors: 46%\|███████████████████████████████████████████▏ \| 2.28G/4.97G [00:09<00:07, 343MB/s] model-00001-of-00002.safetensors: 47%\|████████████████████████████████████████████▍ \| 2.35G/4.97G [00:09<00:07, 340MB/s] model-00001-of-00002.safetensors: 49%\|█████████████████████████████████████████████▋ \| 2.42G/4.97G [00:10<00:07, 320MB/s] model-00001-of-00002.safetensors: 50%\|██████████████████████████████████████████████▉ \| 2.48G/4.97G [00:10<00:07, 336MB/s] model-00001-of-00002.safetensors: 51%\|████████████████████████████████████████████████▎ \| 2.55G/4.97G [00:10<00:07, 335MB/s] model-00001-of-00002.safetensors: 53%\|█████████████████████████████████████████████████▌ \| 2.62G/4.97G [00:10<00:07, 295MB/s] model-00001-of-00002.safetensors: 54%\|██████████████████████████████████████████████████▊ \| 2.68G/4.97G [00:10<00:07, 310MB/s] model-00001-of-00002.safetensors: 55%\|████████████████████████████████████████████████████ \| 2.75G/4.97G [00:11<00:07, 309MB/s] model-00001-of-00002.safetensors: 57%\|█████████████████████████████████████████████████████▎ \| 2.82G/4.97G [00:11<00:06, 308MB/s] model-00001-of-00002.safetensors: 58%\|██████████████████████████████████████████████████████▌ \| 2.89G/4.97G [00:11<00:06, 330MB/s] model-00001-of-00002.safetensors: 59%\|███████████████████████████████████████████████████████▉ \| 2.95G/4.97G [00:11<00:05, 344MB/s] model-00001-of-00002.safetensors: 61%\|█████████████████████████████████████████████████████████▏ \| 3.02G/4.97G [00:11<00:05, 355MB/s] model-00001-of-00002.safetensors: 62%\|██████████████████████████████████████████████████████████▍ \| 3.09G/4.97G [00:12<00:05, 360MB/s] model-00001-of-00002.safetensors: 64%\|███████████████████████████████████████████████████████████▋ \| 3.15G/4.97G [00:12<00:06, 282MB/s] model-00001-of-00002.safetensors: 65%\|████████████████████████████████████████████████████████████▉ \| 3.22G/4.97G [00:12<00:05, 312MB/s] model-00001-of-00002.safetensors: 66%\|██████████████████████████████████████████████████████████████▏ \| 3.29G/4.97G [00:12<00:05, 334MB/s] model-00001-of-00002.safetensors: 68%\|███████████████████████████████████████████████████████████████▌ \| 3.36G/4.97G [00:13<00:06, 253MB/s] model-00001-of-00002.safetensors: 69%\|████████████████████████████████████████████████████████████████▊ \| 3.43G/4.97G [00:13<00:05, 259MB/s] model-00001-of-00002.safetensors: 70%\|██████████████████████████████████████████████████████████████████ \| 3.49G/4.97G [00:13<00:05, 264MB/s] model-00001-of-00002.safetensors: 72%\|███████████████████████████████████████████████████████████████████▍ \| 3.56G/4.97G [00:13<00:05, 274MB/s] model-00001-of-00002.safetensors: 73%\|████████████████████████████████████████████████████████████████████▋ \| 3.63G/4.97G [00:14<00:05, 260MB/s] model-00001-of-00002.safetensors: 74%\|█████████████████████████████████████████████████████████████████████▉ \| 3.70G/4.97G [00:14<00:04, 256MB/s] model-00001-of-00002.safetensors: 76%\|███████████████████████████████████████████████████████████████████████▎ \| 3.77G/4.97G [00:14<00:04, 274MB/s] model-00001-of-00002.safetensors: 77%\|████████████████████████████████████████████████████████████████████████▍ \| 3.83G/4.97G [00:14<00:03, 296MB/s] model-00001-of-00002.safetensors: 78%\|█████████████████████████████████████████████████████████████████████████▊ \| 3.90G/4.97G [00:15<00:03, 317MB/s] model-00001-of-00002.safetensors: 80%\|███████████████████████████████████████████████████████████████████████████ \| 3.96G/4.97G [00:15<00:03, 319MB/s] model-00001-of-00002.safetensors: 81%\|████████████████████████████████████████████████████████████████████████████▎ \| 4.03G/4.97G [00:15<00:02, 324MB/s] model-00001-of-00002.safetensors: 83%\|█████████████████████████████████████████████████████████████████████████████▌ \| 4.10G/4.97G [00:15<00:02, 335MB/s] model-00001-of-00002.safetensors: 84%\|██████████████████████████████████████████████████████████████████████████████▊ \| 4.17G/4.97G [00:15<00:02, 321MB/s] model-00001-of-00002.safetensors: 85%\|████████████████████████████████████████████████████████████████████████████████ \| 4.23G/4.97G [00:16<00:02, 313MB/s] model-00001-of-00002.safetensors: 87%\|█████████████████████████████████████████████████████████████████████████████████▎ \| 4.30G/4.97G [00:16<00:02, 311MB/s] model-00001-of-00002.safetensors: 88%\|██████████████████████████████████████████████████████████████████████████████████▋ \| 4.37G/4.97G [00:16<00:02, 299MB/s] model-00001-of-00002.safetensors: 89%\|███████████████████████████████████████████████████████████████████████████████████▉ \| 4.43G/4.97G [00:16<00:01, 322MB/s] model-00001-of-00002.safetensors: 91%\|█████████████████████████████████████████████████████████████████████████████████████▏ \| 4.50G/4.97G [00:16<00:01, 294MB/s] model-00001-of-00002.safetensors: 92%\|██████████████████████████████████████████████████████████████████████████████████████▍ \| 4.57G/4.97G [00:17<00:01, 313MB/s] model-00001-of-00002.safetensors: 93%\|███████████████████████████████████████████████████████████████████████████████████████▋ \| 4.63G/4.97G [00:17<00:01, 311MB/s] model-00001-of-00002.safetensors: 95%\|████████████████████████████████████████████████████████████████████████████████████████▉ \| 4.70G/4.97G [00:17<00:00, 320MB/s] model-00001-of-00002.safetensors: 96%\|██████████████████████████████████████████████████████████████████████████████████████████▏ \| 4.77G/4.97G [00:17<00:00, 328MB/s] model-00001-of-00002.safetensors: 97%\|███████████████████████████████████████████████████████████████████████████████████████████▍ \| 4.83G/4.97G [00:17<00:00, 326MB/s] model-00001-of-00002.safetensors: 99%\|████████████████████████████████████████████████████████████████████████████████████████████▋ \| 4.90G/4.97G [00:18<00:00, 346MB/s] model-00001-of-00002.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████\| 4.97G/4.97G [00:18<00:00, 290MB/s] model-00001-of-00002.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████\| 4.97G/4.97G [00:18<00:00, 269MB/s]
	model-00002-of-00002.safetensors: 0%\| \| 0.00/3.08G [00:00<?, ?B/s] model-00002-of-00002.safetensors: 0%\| \| 547k/3.08G [00:00<1:11:35, 716kB/s] model-00002-of-00002.safetensors: 0%\| \| 1.87M/3.08G [00:01<25:07, 2.04MB/s] model-00002-of-00002.safetensors: 2%\|██ \| 68.4M/3.08G [00:01<00:48, 62.3MB/s] model-00002-of-00002.safetensors: 4%\|████▏ \| 135M/3.08G [00:01<00:28, 102MB/s] model-00002-of-00002.safetensors: 7%\|██████▎ \| 203M/3.08G [00:02<00:20, 142MB/s] model-00002-of-00002.safetensors: 9%\|████████▎ \| 270M/3.08G [00:02<00:17, 162MB/s] model-00002-of-00002.safetensors: 11%\|██████████▍ \| 337M/3.08G [00:02<00:16, 165MB/s] model-00002-of-00002.safetensors: 13%\|████████████▍ \| 404M/3.08G [00:03<00:13, 200MB/s] model-00002-of-00002.safetensors: 15%\|██████████████▌ \| 471M/3.08G [00:03<00:15, 167MB/s] model-00002-of-00002.safetensors: 17%\|████████████████▌ \| 538M/3.08G [00:04<00:15, 168MB/s] model-00002-of-00002.safetensors: 20%\|██████████████████▋ \| 605M/3.08G [00:04<00:16, 154MB/s] model-00002-of-00002.safetensors: 22%\|████████████████████▋ \| 672M/3.08G [00:04<00:13, 182MB/s] model-00002-of-00002.safetensors: 24%\|██████████████████████▊ \| 739M/3.08G [00:04<00:11, 210MB/s] model-00002-of-00002.safetensors: 26%\|████████████████████████▉ \| 806M/3.08G [00:05<00:11, 198MB/s] model-00002-of-00002.safetensors: 28%\|██████████████████████████▉ \| 873M/3.08G [00:05<00:10, 216MB/s] model-00002-of-00002.safetensors: 31%\|█████████████████████████████ \| 940M/3.08G [00:05<00:08, 245MB/s] model-00002-of-00002.safetensors: 33%\|██████████████████████████████▊ \| 1.01G/3.08G [00:05<00:07, 272MB/s] model-00002-of-00002.safetensors: 35%\|████████████████████████████████▊ \| 1.07G/3.08G [00:06<00:07, 280MB/s] model-00002-of-00002.safetensors: 37%\|██████████████████████████████████▊ \| 1.14G/3.08G [00:06<00:06, 295MB/s] model-00002-of-00002.safetensors: 39%\|████████████████████████████████████▉ \| 1.21G/3.08G [00:06<00:05, 315MB/s] model-00002-of-00002.safetensors: 41%\|██████████████████████████████████████▉ \| 1.28G/3.08G [00:06<00:06, 292MB/s] model-00002-of-00002.safetensors: 44%\|████████████████████████████████████████▉ \| 1.34G/3.08G [00:07<00:05, 302MB/s] model-00002-of-00002.safetensors: 46%\|███████████████████████████████████████████ \| 1.41G/3.08G [00:07<00:05, 303MB/s] model-00002-of-00002.safetensors: 48%\|█████████████████████████████████████████████ \| 1.48G/3.08G [00:07<00:05, 319MB/s] model-00002-of-00002.safetensors: 50%\|███████████████████████████████████████████████ \| 1.54G/3.08G [00:07<00:04, 318MB/s] model-00002-of-00002.safetensors: 52%\|█████████████████████████████████████████████████▏ \| 1.61G/3.08G [00:07<00:04, 339MB/s] model-00002-of-00002.safetensors: 54%\|███████████████████████████████████████████████████▏ \| 1.68G/3.08G [00:08<00:04, 337MB/s] model-00002-of-00002.safetensors: 57%\|█████████████████████████████████████████████████████▎ \| 1.74G/3.08G [00:08<00:04, 315MB/s] model-00002-of-00002.safetensors: 59%\|███████████████████████████████████████████████████████▎ \| 1.81G/3.08G [00:08<00:04, 291MB/s] model-00002-of-00002.safetensors: 61%\|█████████████████████████████████████████████████████████▎ \| 1.88G/3.08G [00:08<00:03, 314MB/s] model-00002-of-00002.safetensors: 63%\|███████████████████████████████████████████████████████████▍ \| 1.95G/3.08G [00:08<00:03, 320MB/s] model-00002-of-00002.safetensors: 65%\|█████████████████████████████████████████████████████████████▏ \| 2.00G/3.08G [00:09<00:03, 311MB/s] model-00002-of-00002.safetensors: 67%\|███████████████████████████████████████████████████████████████▎ \| 2.07G/3.08G [00:09<00:03, 299MB/s] model-00002-of-00002.safetensors: 70%\|█████████████████████████████████████████████████████████████████▎ \| 2.14G/3.08G [00:09<00:02, 320MB/s] model-00002-of-00002.safetensors: 72%\|███████████████████████████████████████████████████████████████████▍ \| 2.21G/3.08G [00:09<00:02, 343MB/s] model-00002-of-00002.safetensors: 74%\|█████████████████████████████████████████████████████████████████████▍ \| 2.27G/3.08G [00:09<00:02, 355MB/s] model-00002-of-00002.safetensors: 76%\|███████████████████████████████████████████████████████████████████████▍ \| 2.34G/3.08G [00:10<00:02, 364MB/s] model-00002-of-00002.safetensors: 78%\|█████████████████████████████████████████████████████████████████████████▌ \| 2.41G/3.08G [00:10<00:01, 367MB/s] model-00002-of-00002.safetensors: 80%\|███████████████████████████████████████████████████████████████████████████▌ \| 2.47G/3.08G [00:10<00:01, 383MB/s] model-00002-of-00002.safetensors: 83%\|█████████████████████████████████████████████████████████████████████████████▌ \| 2.54G/3.08G [00:10<00:01, 380MB/s] model-00002-of-00002.safetensors: 85%\|███████████████████████████████████████████████████████████████████████████████▋ \| 2.61G/3.08G [00:10<00:01, 304MB/s] model-00002-of-00002.safetensors: 87%\|█████████████████████████████████████████████████████████████████████████████████▋ \| 2.68G/3.08G [00:11<00:01, 281MB/s] model-00002-of-00002.safetensors: 89%\|███████████████████████████████████████████████████████████████████████████████████▊ \| 2.74G/3.08G [00:11<00:01, 306MB/s] model-00002-of-00002.safetensors: 91%\|█████████████████████████████████████████████████████████████████████████████████████▊ \| 2.81G/3.08G [00:11<00:01, 259MB/s] model-00002-of-00002.safetensors: 93%\|███████████████████████████████████████████████████████████████████████████████████████▊ \| 2.88G/3.08G [00:11<00:00, 288MB/s] model-00002-of-00002.safetensors: 96%\|█████████████████████████████████████████████████████████████████████████████████████████▉ \| 2.94G/3.08G [00:12<00:00, 299MB/s] model-00002-of-00002.safetensors: 98%\|███████████████████████████████████████████████████████████████████████████████████████████▉ \| 3.01G/3.08G [00:12<00:00, 251MB/s] model-00002-of-00002.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████\| 3.08G/3.08G [00:12<00:00, 281MB/s] model-00002-of-00002.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████\| 3.08G/3.08G [00:12<00:00, 244MB/s]
	Loading checkpoint shards: 0%\| \| 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%\|█████████████████████████████████████████████████████▌ \| 1/2 [00:06<00:06, 6.04s/it] Loading checkpoint shards: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:10<00:00, 5.01s/it] Loading checkpoint shards: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████\| 2/2 [00:10<00:00, 5.16s/it]
	generation_config.json: 0%\| \| 0.00/188 [00:00<?, ?B/s] generation_config.json: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████\| 188/188 [00:00<00:00, 2.83MB/s]
	[2025-12-27 21:18:58,111] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:122677] converting PEFT model w/ prepare_model_for_kbit_training
	[2025-12-27 21:18:58,113] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:122677] Converting modules to torch.bfloat16
	[2025-12-27 21:18:58,117] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] Memory usage after model load 5.665GB (+5.665GB allocated, +5.826GB reserved)
	trainable params: 66,060,288 \|\| all params: 4,088,528,384 \|\| trainable%: 1.6157
	[2025-12-27 21:18:58,545] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] after adapters 4.411GB (+4.411GB allocated, +5.947GB reserved)
	[2025-12-27 21:19:03,382] [INFO] [axolotl.train.save_initial_configs:398] [PID:122677] Pre-saving adapter config to ./outputs/mymodel...
	[2025-12-27 21:19:03,388] [INFO] [axolotl.train.save_initial_configs:402] [PID:122677] Pre-saving tokenizer to ./outputs/mymodel...
	[2025-12-27 21:19:03,587] [INFO] [axolotl.train.save_initial_configs:407] [PID:122677] Pre-saving model config to ./outputs/mymodel...
	[2025-12-27 21:19:03,594] [INFO] [axolotl.train.execute_training:196] [PID:122677] Starting trainer...
	0%\| \| 0/81 [00:00<?, ?it/s][2025-12-27 21:19:05,215] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
	warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

	1%\|█▋ \| 1/81 [00:26<34:57, 26.22s/it] {'loss': 0.7532, 'grad_norm': 0.6496106386184692, 'learning_rate': 0.0, 'memory/max_active (GiB)': 76.91, 'memory/max_allocated (GiB)': 76.91, 'memory/device_reserved (GiB)': 79.86, 'tokens_per_second_per_gpu': 877.99, 'epoch': 0.04}
	1%\|█▋ \| 1/81 [00:26<34:57, 26.22s/it] 2%\|███▎ \| 2/81 [00:48<31:41, 24.06s/it] {'loss': 0.6528, 'grad_norm': 0.46730196475982666, 'learning_rate': 1e-06, 'memory/max_active (GiB)': 46.79, 'memory/max_allocated (GiB)': 46.79, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 754.12, 'epoch': 0.07}
	2%\|███▎ \| 2/81 [00:48<31:41, 24.06s/it] 4%\|████▉ \| 3/81 [01:14<32:03, 24.66s/it] {'loss': 2.2564, 'grad_norm': 0.6027721762657166, 'learning_rate': 2e-06, 'memory/max_active (GiB)': 57.88, 'memory/max_allocated (GiB)': 57.88, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.03, 'epoch': 0.11}
	4%\|████▉ \| 3/81 [01:14<32:03, 24.66s/it] 5%\|██████▌ \| 4/81 [01:38<31:33, 24.59s/it] {'loss': 2.3578, 'grad_norm': 1.7880005836486816, 'learning_rate': 1.9992093972273017e-06, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 879.33, 'epoch': 0.15}
	5%\|██████▌ \| 4/81 [01:38<31:33, 24.59s/it] 6%\|████████▏ \| 5/81 [02:02<30:38, 24.18s/it] {'loss': 0.9175, 'grad_norm': 0.5934199690818787, 'learning_rate': 1.9968388390146957e-06, 'memory/max_active (GiB)': 57.81, 'memory/max_allocated (GiB)': 57.81, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 759.41, 'epoch': 0.19}
	6%\|████████▏ \| 5/81 [02:02<30:38, 24.18s/it] 7%\|█████████▊ \| 6/81 [02:25<29:58, 23.97s/it] {'loss': 1.6229, 'grad_norm': 1.5621187686920166, 'learning_rate': 1.992892073701973e-06, 'memory/max_active (GiB)': 68.44, 'memory/max_allocated (GiB)': 68.44, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 949.82, 'epoch': 0.22}
	7%\|█████████▊ \| 6/81 [02:25<29:58, 23.97s/it] 9%\|███████████▍ \| 7/81 [02:51<30:10, 24.47s/it] {'loss': 2.9604, 'grad_norm': 1.206151008605957, 'learning_rate': 1.987375341936333e-06, 'memory/max_active (GiB)': 60.61, 'memory/max_allocated (GiB)': 60.61, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 862.01, 'epoch': 0.26}
	9%\|███████████▍ \| 7/81 [02:51<30:10, 24.47s/it] 10%\|█████████████▏ \| 8/81 [03:15<29:40, 24.39s/it] {'loss': 0.8612, 'grad_norm': 0.6299921870231628, 'learning_rate': 1.9802973668046363e-06, 'memory/max_active (GiB)': 44.43, 'memory/max_allocated (GiB)': 44.43, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.47, 'epoch': 0.3}
	10%\|█████████████▏ \| 8/81 [03:15<29:40, 24.39s/it] 11%\|██████████████▊ \| 9/81 [03:41<30:02, 25.03s/it] {'loss': 1.2738, 'grad_norm': 1.1461963653564453, 'learning_rate': 1.9716693400404097e-06, 'memory/max_active (GiB)': 88.86, 'memory/max_allocated (GiB)': 88.86, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.2, 'epoch': 0.33}
	11%\|██████████████▊ \| 9/81 [03:41<30:02, 25.03s/it] 12%\|████████████████▎ \| 10/81 [04:06<29:20, 24.80s/it] {'loss': 0.9472, 'grad_norm': 0.4421791732311249, 'learning_rate': 1.9615049043274204e-06, 'memory/max_active (GiB)': 84.65, 'memory/max_allocated (GiB)': 84.65, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 819.72, 'epoch': 0.37}
	12%\|████████████████▎ \| 10/81 [04:06<29:20, 24.80s/it] 14%\|█████████████████▉ \| 11/81 [04:32<29:37, 25.40s/it] {'loss': 1.3682, 'grad_norm': 1.287254810333252, 'learning_rate': 1.949820131727783e-06, 'memory/max_active (GiB)': 86.21, 'memory/max_allocated (GiB)': 86.21, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1004.54, 'epoch': 0.41}
	14%\|█████████████████▉ \| 11/81 [04:32<29:37, 25.40s/it] 15%\|███████████████████▌ \| 12/81 [04:55<28:20, 24.65s/it] {'loss': 1.3094, 'grad_norm': 0.8032840490341187, 'learning_rate': 1.936633498268728e-06, 'memory/max_active (GiB)': 60.39, 'memory/max_allocated (GiB)': 60.39, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.54, 'epoch': 0.45}
	15%\|███████████████████▌ \| 12/81 [04:55<28:20, 24.65s/it] 16%\|█████████████████████▏ \| 13/81 [05:19<27:30, 24.27s/it] {'loss': 1.7079, 'grad_norm': 0.835660457611084, 'learning_rate': 1.9219658547282065e-06, 'memory/max_active (GiB)': 58.53, 'memory/max_allocated (GiB)': 58.53, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 804.11, 'epoch': 0.48}
	16%\|█████████████████████▏ \| 13/81 [05:19<27:30, 24.27s/it] 17%\|██████████████████████▊ \| 14/81 [05:43<27:02, 24.21s/it] {'loss': 1.4821, 'grad_norm': 0.7599063515663147, 'learning_rate': 1.9058403936655232e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 771.26, 'epoch': 0.52}
	17%\|██████████████████████▊ \| 14/81 [05:43<27:02, 24.21s/it] 19%\|████████████████████████▍ \| 15/81 [06:09<27:17, 24.81s/it] {'loss': 1.1628, 'grad_norm': 0.43679726123809814, 'learning_rate': 1.8882826127491318e-06, 'memory/max_active (GiB)': 79.08, 'memory/max_allocated (GiB)': 79.08, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 897.62, 'epoch': 0.56}
	19%\|████████████████████████▍ \| 15/81 [06:09<27:17, 24.81s/it] 20%\|██████████████████████████ \| 16/81 [06:34<26:55, 24.85s/it] {'loss': 2.6527, 'grad_norm': 0.6629171967506409, 'learning_rate': 1.8693202744395827e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 881.06, 'epoch': 0.59}
	20%\|██████████████████████████ \| 16/81 [06:34<26:55, 24.85s/it] 21%\|███████████████████████████▋ \| 17/81 [06:59<26:31, 24.86s/it] {'loss': 1.5066, 'grad_norm': 0.6648272275924683, 'learning_rate': 1.848983362091364e-06, 'memory/max_active (GiB)': 85.82, 'memory/max_allocated (GiB)': 85.82, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 0.63}
	21%\|███████████████████████████▋ \| 17/81 [06:59<26:31, 24.86s/it] 22%\|█████████████████████████████▎ \| 18/81 [07:24<26:10, 24.93s/it] {'loss': 2.1366, 'grad_norm': 1.9141907691955566, 'learning_rate': 1.8273040325430573e-06, 'memory/max_active (GiB)': 58.11, 'memory/max_allocated (GiB)': 58.11, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 996.41, 'epoch': 0.67}
	22%\|█████████████████████████████▎ \| 18/81 [07:24<26:10, 24.93s/it] 23%\|██████████████████████████████▉ \| 19/81 [07:49<25:42, 24.88s/it] {'loss': 1.7062, 'grad_norm': 1.0429956912994385, 'learning_rate': 1.8043165652707648e-06, 'memory/max_active (GiB)': 69.3, 'memory/max_allocated (GiB)': 69.3, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 839.76, 'epoch': 0.71}
	23%\|██████████████████████████████▉ \| 19/81 [07:49<25:42, 24.88s/it] 25%\|████████████████████████████████▌ \| 20/81 [08:16<26:07, 25.69s/it] {'loss': 1.7071, 'grad_norm': 0.8590395450592041, 'learning_rate': 1.780057308185212e-06, 'memory/max_active (GiB)': 89.48, 'memory/max_allocated (GiB)': 89.48, 'memory/device_reserved (GiB)': 92.15, 'tokens_per_second_per_gpu': 951.25, 'epoch': 0.74}
	25%\|████████████████████████████████▌ \| 20/81 [08:16<26:07, 25.69s/it] 26%\|██████████████████████████████████▏ \| 21/81 [08:42<25:35, 25.60s/it] {'loss': 1.4588, 'grad_norm': 0.7291064858436584, 'learning_rate': 1.75456462015823e-06, 'memory/max_active (GiB)': 89.39, 'memory/max_allocated (GiB)': 89.39, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 828.05, 'epoch': 0.78}
	26%\|██████████████████████████████████▏ \| 21/81 [08:42<25:35, 25.60s/it] 27%\|███████████████████████████████████▊ \| 22/81 [09:08<25:26, 25.87s/it] {'loss': 1.8362, 'grad_norm': 1.2527186870574951, 'learning_rate': 1.7278788103694942e-06, 'memory/max_active (GiB)': 68.62, 'memory/max_allocated (GiB)': 68.62, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 817.67, 'epoch': 0.82}
	27%\|███████████████████████████████████▊ \| 22/81 [09:08<25:26, 25.87s/it] 28%\|█████████████████████████████████████▍ \| 23/81 [09:34<24:59, 25.86s/it] {'loss': 1.5874, 'grad_norm': 1.3045358657836914, 'learning_rate': 1.7000420745694253e-06, 'memory/max_active (GiB)': 78.8, 'memory/max_allocated (GiB)': 78.8, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 864.14, 'epoch': 0.85}
	28%\|█████████████████████████████████████▍ \| 23/81 [09:34<24:59, 25.86s/it] 30%\|███████████████████████████████████████ \| 24/81 [10:00<24:31, 25.82s/it] {'loss': 2.1574, 'grad_norm': 1.6913419961929321, 'learning_rate': 1.6710984283590367e-06, 'memory/max_active (GiB)': 81.59, 'memory/max_allocated (GiB)': 81.59, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 897.53, 'epoch': 0.89}
	30%\|███████████████████████████████████████ \| 24/81 [10:00<24:31, 25.82s/it] 31%\|████████████████████████████████████████▋ \| 25/81 [10:24<23:40, 25.37s/it] {'loss': 1.058, 'grad_norm': 0.7722698450088501, 'learning_rate': 1.64109363759222e-06, 'memory/max_active (GiB)': 61.45, 'memory/max_allocated (GiB)': 61.45, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 942.84, 'epoch': 0.93}
	31%\|████████████████████████████████████████▋ \| 25/81 [10:24<23:40, 25.37s/it] 32%\|██████████████████████████████████████████▎ \| 26/81 [10:50<23:33, 25.70s/it] {'loss': 0.9906, 'grad_norm': 0.9898315072059631, 'learning_rate': 1.6100751460105243e-06, 'memory/max_active (GiB)': 86.27, 'memory/max_allocated (GiB)': 86.27, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 910.63, 'epoch': 0.97}
	32%\|██████████████████████████████████████████▎ \| 26/81 [10:50<23:33, 25.70s/it] 33%\|████████████████████████████████████████████ \| 27/81 [11:11<21:51, 24.28s/it] {'loss': 0.716, 'grad_norm': 0.5236030220985413, 'learning_rate': 1.5780920002248483e-06, 'memory/max_active (GiB)': 45.08, 'memory/max_allocated (GiB)': 45.08, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 793.15, 'epoch': 1.0}
	33%\|████████████████████████████████████████████ \| 27/81 [11:11<21:51, 24.28s/it][2025-12-27 21:30:15,957] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-27
	[2025-12-27 21:30:18,782] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
	warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

	35%\|█████████████████████████████████████████████▋ \| 28/81 [11:37<21:44, 24.62s/it] {'loss': 1.4101, 'grad_norm': 0.6720283031463623, 'learning_rate': 1.5451947721626675e-06, 'memory/max_active (GiB)': 49.49, 'memory/max_allocated (GiB)': 49.49, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 713.64, 'epoch': 1.04}
	35%\|█████████████████████████████████████████████▋ \| 28/81 [11:37<21:44, 24.62s/it] 36%\|███████████████████████████████████████████████▎ \| 29/81 [12:01<21:16, 24.55s/it] {'loss': 2.9962, 'grad_norm': 2.1970348358154297, 'learning_rate': 1.5114354791034222e-06, 'memory/max_active (GiB)': 68.78, 'memory/max_allocated (GiB)': 68.78, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 887.43, 'epoch': 1.07}
	36%\|███████████████████████████████████████████████▎ \| 29/81 [12:01<21:16, 24.55s/it] 37%\|████████████████████████████████████████████████▉ \| 30/81 [12:24<20:17, 23.88s/it] {'loss': 0.6654, 'grad_norm': 0.5169208645820618, 'learning_rate': 1.476867501428506e-06, 'memory/max_active (GiB)': 69.25, 'memory/max_allocated (GiB)': 69.25, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 837.12, 'epoch': 1.11}
	37%\|████████████████████████████████████████████████▉ \| 30/81 [12:24<20:17, 23.88s/it] 38%\|██████████████████████████████████████████████████▌ \| 31/81 [12:50<20:28, 24.58s/it] {'loss': 2.3111, 'grad_norm': 1.6675411462783813, 'learning_rate': 1.4415454982159118e-06, 'memory/max_active (GiB)': 84.87, 'memory/max_allocated (GiB)': 84.87, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 893.34, 'epoch': 1.15}
	38%\|██████████████████████████████████████████████████▌ \| 31/81 [12:50<20:28, 24.58s/it] 40%\|████████████████████████████████████████████████████▏ \| 32/81 [13:13<19:47, 24.24s/it] {'loss': 1.454, 'grad_norm': 0.8987345695495605, 'learning_rate': 1.4055253208129937e-06, 'memory/max_active (GiB)': 54.96, 'memory/max_allocated (GiB)': 54.96, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 747.48, 'epoch': 1.19}
	40%\|████████████████████████████████████████████████████▏ \| 32/81 [13:13<19:47, 24.24s/it] 41%\|█████████████████████████████████████████████████████▊ \| 33/81 [13:39<19:45, 24.71s/it] {'loss': 2.1194, 'grad_norm': 2.0316708087921143, 'learning_rate': 1.3688639245240078e-06, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 880.1, 'epoch': 1.22}
	41%\|█████████████████████████████████████████████████████▊ \| 33/81 [13:39<19:45, 24.71s/it] 42%\|███████████████████████████████████████████████████████▍ \| 34/81 [14:05<19:37, 25.04s/it] {'loss': 1.3693, 'grad_norm': 0.9677668213844299, 'learning_rate': 1.3316192785520678e-06, 'memory/max_active (GiB)': 52.15, 'memory/max_allocated (GiB)': 52.15, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 903.2, 'epoch': 1.26}
	42%\|███████████████████████████████████████████████████████▍ \| 34/81 [14:05<19:37, 25.04s/it] 43%\|█████████████████████████████████████████████████████████ \| 35/81 [14:32<19:46, 25.80s/it] {'loss': 1.5613, 'grad_norm': 1.5035587549209595, 'learning_rate': 1.2938502743379209e-06, 'memory/max_active (GiB)': 89.55, 'memory/max_allocated (GiB)': 89.55, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 1018.31, 'epoch': 1.3}
	43%\|█████████████████████████████████████████████████████████ \| 35/81 [14:32<19:46, 25.80s/it] 44%\|██████████████████████████████████████████████████████████▋ \| 36/81 [14:56<18:53, 25.18s/it] {'loss': 1.6472, 'grad_norm': 1.0176411867141724, 'learning_rate': 1.2556166324404746e-06, 'memory/max_active (GiB)': 89.41, 'memory/max_allocated (GiB)': 89.41, 'memory/device_reserved (GiB)': 90.5, 'tokens_per_second_per_gpu': 935.17, 'epoch': 1.33}
	44%\|██████████████████████████████████████████████████████████▋ \| 36/81 [14:56<18:53, 25.18s/it] 46%\|████████████████████████████████████████████████████████████▎ \| 37/81 [15:21<18:23, 25.08s/it] {'loss': 2.048, 'grad_norm': 1.1598583459854126, 'learning_rate': 1.2169788081063178e-06, 'memory/max_active (GiB)': 76.99, 'memory/max_allocated (GiB)': 76.99, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 848.84, 'epoch': 1.37}
	46%\|████████████████████████████████████████████████████████████▎ \| 37/81 [15:21<18:23, 25.08s/it] 47%\|█████████████████████████████████████████████████████████████▉ \| 38/81 [15:47<18:07, 25.30s/it] {'loss': 1.4693, 'grad_norm': 1.03346848487854, 'learning_rate': 1.1779978956775504e-06, 'memory/max_active (GiB)': 81.33, 'memory/max_allocated (GiB)': 81.33, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 864.18, 'epoch': 1.41}
	47%\|█████████████████████████████████████████████████████████████▉ \| 38/81 [15:47<18:07, 25.30s/it] 48%\|███████████████████████████████████████████████████████████████▌ \| 39/81 [16:09<17:02, 24.35s/it] {'loss': 0.629, 'grad_norm': 0.5300745964050293, 'learning_rate': 1.1387355319890683e-06, 'memory/max_active (GiB)': 38.13, 'memory/max_allocated (GiB)': 38.13, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 660.81, 'epoch': 1.45}
	48%\|███████████████████████████████████████████████████████████████▌ \| 39/81 [16:09<17:02, 24.35s/it] 49%\|█████████████████████████████████████████████████████████████████▏ \| 40/81 [16:35<17:04, 24.98s/it] {'loss': 1.354, 'grad_norm': 0.45447468757629395, 'learning_rate': 1.0992537989080618e-06, 'memory/max_active (GiB)': 68.52, 'memory/max_allocated (GiB)': 68.52, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 953.46, 'epoch': 1.48}
	49%\|█████████████████████████████████████████████████████████████████▏ \| 40/81 [16:35<17:04, 24.98s/it] 51%\|██████████████████████████████████████████████████████████████████▊ \| 41/81 [17:00<16:39, 25.00s/it] {'loss': 0.7253, 'grad_norm': 0.3777391314506531, 'learning_rate': 1.0596151251698198e-06, 'memory/max_active (GiB)': 86.09, 'memory/max_allocated (GiB)': 86.09, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 994.9, 'epoch': 1.52}
	51%\|██████████████████████████████████████████████████████████████████▊ \| 41/81 [17:00<16:39, 25.00s/it] 52%\|██████████████████████████████ \| 42/81 [17:25<16:14, 25.00s/it] {'loss': 0.5797, 'grad_norm': 0.4444688856601715, 'learning_rate': 1.01988218766507e-06, 'memory/max_active (GiB)': 67.08, 'memory/max_allocated (GiB)': 67.08, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 768.21, 'epoch': 1.56}
	52%\|██████████████████████████████ \| 42/81 [17:25<16:14, 25.00s/it] 53%\|██████████████████████████████▊ \| 43/81 [17:52<16:13, 25.61s/it] {'loss': 1.7366, 'grad_norm': 0.8071985244750977, 'learning_rate': 9.801178123349297e-07, 'memory/max_active (GiB)': 84.78, 'memory/max_allocated (GiB)': 84.78, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 1054.07, 'epoch': 1.59}
	53%\|██████████████████████████████▊ \| 43/81 [17:52<16:13, 25.61s/it] 54%\|███████████████████████████████▌ \| 44/81 [18:17<15:33, 25.23s/it] {'loss': 0.8716, 'grad_norm': 0.7125491499900818, 'learning_rate': 9.403848748301802e-07, 'memory/max_active (GiB)': 62.62, 'memory/max_allocated (GiB)': 62.62, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 829.41, 'epoch': 1.63}
	54%\|███████████████████████████████▌ \| 44/81 [18:17<15:33, 25.23s/it] 56%\|████████████████████████████████▏ \| 45/81 [18:43<15:15, 25.42s/it] {'loss': 3.5353, 'grad_norm': 1.353916883468628, 'learning_rate': 9.007462010919385e-07, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 878.82, 'epoch': 1.67}
	56%\|████████████████████████████████▏ \| 45/81 [18:43<15:15, 25.42s/it] 57%\|████████████████████████████████▉ \| 46/81 [19:06<14:29, 24.85s/it] {'loss': 1.3489, 'grad_norm': 1.1499241590499878, 'learning_rate': 8.612644680109318e-07, 'memory/max_active (GiB)': 79.0, 'memory/max_allocated (GiB)': 79.0, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 868.24, 'epoch': 1.71}
	57%\|████████████████████████████████▉ \| 46/81 [19:06<14:29, 24.85s/it] 58%\|█████████████████████████████████▋ \| 47/81 [19:31<14:08, 24.96s/it] {'loss': 1.543, 'grad_norm': 1.5999720096588135, 'learning_rate': 8.220021043224499e-07, 'memory/max_active (GiB)': 61.32, 'memory/max_allocated (GiB)': 61.32, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 935.5, 'epoch': 1.74}
	58%\|█████████████████████████████████▋ \| 47/81 [19:31<14:08, 24.96s/it] 59%\|██████████████████████████████████▎ \| 48/81 [19:56<13:39, 24.84s/it] {'loss': 1.2282, 'grad_norm': 0.930808961391449, 'learning_rate': 7.830211918936819e-07, 'memory/max_active (GiB)': 44.35, 'memory/max_allocated (GiB)': 44.35, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 827.33, 'epoch': 1.78}
	59%\|██████████████████████████████████▎ \| 48/81 [19:56<13:39, 24.84s/it] 60%\|███████████████████████████████████ \| 49/81 [20:19<12:56, 24.27s/it] {'loss': 3.0884, 'grad_norm': 1.004947543144226, 'learning_rate': 7.443833675595253e-07, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 782.62, 'epoch': 1.82}
	60%\|███████████████████████████████████ \| 49/81 [20:19<12:56, 24.27s/it] 62%\|███████████████████████████████████▊ \| 50/81 [20:44<12:37, 24.44s/it] {'loss': 1.0819, 'grad_norm': 0.8130286335945129, 'learning_rate': 7.061497256620792e-07, 'memory/max_active (GiB)': 49.75, 'memory/max_allocated (GiB)': 49.75, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 915.1, 'epoch': 1.85}
	62%\|███████████████████████████████████▊ \| 50/81 [20:44<12:37, 24.44s/it] 63%\|████████████████████████████████████▌ \| 51/81 [21:11<12:40, 25.36s/it] {'loss': 1.6019, 'grad_norm': 0.8784174919128418, 'learning_rate': 6.683807214479323e-07, 'memory/max_active (GiB)': 75.17, 'memory/max_allocated (GiB)': 75.17, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 988.63, 'epoch': 1.89}
	63%\|████████████████████████████████████▌ \| 51/81 [21:11<12:40, 25.36s/it] 64%\|█████████████████████████████████████▏ \| 52/81 [21:38<12:30, 25.88s/it] {'loss': 1.3716, 'grad_norm': 1.0731853246688843, 'learning_rate': 6.311360754759923e-07, 'memory/max_active (GiB)': 88.84, 'memory/max_allocated (GiB)': 88.84, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 865.92, 'epoch': 1.93}
	64%\|█████████████████████████████████████▏ \| 52/81 [21:38<12:30, 25.88s/it] 65%\|█████████████████████████████████████▉ \| 53/81 [22:03<11:54, 25.53s/it] {'loss': 0.4119, 'grad_norm': 0.340857595205307, 'learning_rate': 5.944746791870061e-07, 'memory/max_active (GiB)': 86.43, 'memory/max_allocated (GiB)': 86.43, 'memory/device_reserved (GiB)': 89.76, 'tokens_per_second_per_gpu': 873.98, 'epoch': 1.97}
	65%\|█████████████████████████████████████▉ \| 53/81 [22:03<11:54, 25.53s/it] 67%\|██████████████████████████████████████▋ \| 54/81 [22:25<11:04, 24.60s/it] {'loss': 0.7065, 'grad_norm': 0.48797884583473206, 'learning_rate': 5.584545017840885e-07, 'memory/max_active (GiB)': 86.08, 'memory/max_allocated (GiB)': 86.08, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 844.64, 'epoch': 2.0}
	67%\|██████████████████████████████████████▋ \| 54/81 [22:25<11:04, 24.60s/it][2025-12-27 21:41:29,979] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-54
	[2025-12-27 21:41:32,641] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
	warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

	68%\|███████████████████████████████████████▍ \| 55/81 [22:54<11:09, 25.76s/it] {'loss': 2.1499, 'grad_norm': 1.208370566368103, 'learning_rate': 5.231324985714941e-07, 'memory/max_active (GiB)': 81.46, 'memory/max_allocated (GiB)': 81.46, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 850.71, 'epoch': 2.04}
	68%\|███████████████████████████████████████▍ \| 55/81 [22:54<11:09, 25.76s/it] 69%\|████████████████████████████████████████ \| 56/81 [23:17<10:24, 24.99s/it] {'loss': 0.8085, 'grad_norm': 0.6215103268623352, 'learning_rate': 4.885645208965778e-07, 'memory/max_active (GiB)': 45.1, 'memory/max_allocated (GiB)': 45.1, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 825.2, 'epoch': 2.07}
	69%\|████████████████████████████████████████ \| 56/81 [23:17<10:24, 24.99s/it] 70%\|████████████████████████████████████████████████████████████████████████████████████████████▉ \| 57/81 [23:42<10:00, 25.02s/it] {'loss': 1.8707, 'grad_norm': 1.5340229272842407, 'learning_rate': 4.5480522783733265e-07, 'memory/max_active (GiB)': 74.01, 'memory/max_allocated (GiB)': 74.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 974.6, 'epoch': 2.11}
	70%\|████████████████████████████████████████████████████████████████████████████████████████████▉ \| 57/81 [23:42<10:00, 25.02s/it] 72%\|██████████████████████████████████████████████████████████████████████████████████████████████▌ \| 58/81 [24:06<09:27, 24.66s/it] {'loss': 1.289, 'grad_norm': 0.5400649309158325, 'learning_rate': 4.2190799977515145e-07, 'memory/max_active (GiB)': 68.43, 'memory/max_allocated (GiB)': 68.43, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 871.48, 'epoch': 2.15}
	72%\|██████████████████████████████████████████████████████████████████████████████████████████████▌ \| 58/81 [24:06<09:27, 24.66s/it] 73%\|████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 59/81 [24:32<09:11, 25.05s/it] {'loss': 0.8458, 'grad_norm': 0.4076080322265625, 'learning_rate': 3.8992485398947563e-07, 'memory/max_active (GiB)': 62.83, 'memory/max_allocated (GiB)': 62.83, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 2.19}
	73%\|████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 59/81 [24:32<09:11, 25.05s/it] 74%\|█████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 60/81 [24:56<08:39, 24.73s/it] {'loss': 1.7388, 'grad_norm': 0.8173409700393677, 'learning_rate': 3.5890636240778015e-07, 'memory/max_active (GiB)': 51.13, 'memory/max_allocated (GiB)': 51.13, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 880.54, 'epoch': 2.22}
	74%\|█████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 60/81 [24:56<08:39, 24.73s/it] 75%\|███████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 61/81 [25:19<08:04, 24.21s/it] {'loss': 0.7868, 'grad_norm': 0.6387739181518555, 'learning_rate': 3.289015716409631e-07, 'memory/max_active (GiB)': 41.57, 'memory/max_allocated (GiB)': 41.57, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 740.08, 'epoch': 2.26}
	75%\|███████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 61/81 [25:19<08:04, 24.21s/it] 77%\|█████████████████████████████████████████████████████████████████████████████████████████████████████ \| 62/81 [25:45<07:49, 24.70s/it] {'loss': 0.8116, 'grad_norm': 0.691184401512146, 'learning_rate': 2.9995792543057473e-07, 'memory/max_active (GiB)': 71.73, 'memory/max_allocated (GiB)': 71.73, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 889.11, 'epoch': 2.3}
	77%\|█████████████████████████████████████████████████████████████████████████████████████████████████████ \| 62/81 [25:45<07:49, 24.70s/it] 78%\|██████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 63/81 [26:09<07:22, 24.58s/it] {'loss': 0.6392, 'grad_norm': 0.492448627948761, 'learning_rate': 2.721211896305059e-07, 'memory/max_active (GiB)': 66.99, 'memory/max_allocated (GiB)': 66.99, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 779.69, 'epoch': 2.33}
	78%\|██████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 63/81 [26:09<07:22, 24.58s/it] 79%\|████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 64/81 [26:33<06:56, 24.47s/it] {'loss': 0.7128, 'grad_norm': 0.5834090709686279, 'learning_rate': 2.454353798417698e-07, 'memory/max_active (GiB)': 79.01, 'memory/max_allocated (GiB)': 79.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 821.37, 'epoch': 2.37}
	79%\|████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 64/81 [26:33<06:56, 24.47s/it] 80%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 65/81 [26:56<06:24, 24.04s/it] {'loss': 0.8861, 'grad_norm': 0.6599698662757874, 'learning_rate': 2.1994269181478798e-07, 'memory/max_active (GiB)': 47.26, 'memory/max_allocated (GiB)': 47.26, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 811.24, 'epoch': 2.41}
	80%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 65/81 [26:56<06:24, 24.04s/it] 81%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 66/81 [27:23<06:10, 24.71s/it] {'loss': 2.1615, 'grad_norm': 0.7803909778594971, 'learning_rate': 1.956834347292352e-07, 'memory/max_active (GiB)': 89.03, 'memory/max_allocated (GiB)': 89.03, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 882.58, 'epoch': 2.45}
	81%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 66/81 [27:23<06:10, 24.71s/it] 83%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 67/81 [27:49<05:51, 25.11s/it] {'loss': 0.9951, 'grad_norm': 0.558167576789856, 'learning_rate': 1.7269596745694292e-07, 'memory/max_active (GiB)': 89.47, 'memory/max_allocated (GiB)': 89.47, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 948.51, 'epoch': 2.48}
	83%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 67/81 [27:49<05:51, 25.11s/it] 84%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 68/81 [28:16<05:34, 25.73s/it] {'loss': 2.4148, 'grad_norm': 1.3278695344924927, 'learning_rate': 1.5101663790863595e-07, 'memory/max_active (GiB)': 89.46, 'memory/max_allocated (GiB)': 89.46, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.43, 'epoch': 2.52}
	84%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 68/81 [28:16<05:34, 25.73s/it] 85%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 69/81 [28:40<05:01, 25.12s/it] {'loss': 1.4344, 'grad_norm': 1.1428226232528687, 'learning_rate': 1.306797255604175e-07, 'memory/max_active (GiB)': 51.65, 'memory/max_allocated (GiB)': 51.65, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 852.22, 'epoch': 2.56}
	85%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 69/81 [28:40<05:01, 25.12s/it] 86%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 70/81 [29:03<04:30, 24.60s/it] {'loss': 1.6188, 'grad_norm': 1.5298364162445068, 'learning_rate': 1.1171738725086832e-07, 'memory/max_active (GiB)': 68.56, 'memory/max_allocated (GiB)': 68.56, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 861.71, 'epoch': 2.59}
	86%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 70/81 [29:03<04:30, 24.60s/it] 88%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 71/81 [29:29<04:09, 24.94s/it] {'loss': 0.6562, 'grad_norm': 0.5825881361961365, 'learning_rate': 9.415960633447673e-08, 'memory/max_active (GiB)': 86.4, 'memory/max_allocated (GiB)': 86.4, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 991.35, 'epoch': 2.63}
	88%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 71/81 [29:29<04:09, 24.94s/it] 89%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 72/81 [29:52<03:40, 24.49s/it] {'loss': 3.4467, 'grad_norm': 1.508130669593811, 'learning_rate': 7.803414527179342e-08, 'memory/max_active (GiB)': 56.32, 'memory/max_allocated (GiB)': 56.32, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 757.41, 'epoch': 2.67}
	89%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 72/81 [29:52<03:40, 24.49s/it] 90%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 73/81 [30:17<03:15, 24.48s/it] {'loss': 3.2649, 'grad_norm': 1.0750031471252441, 'learning_rate': 6.336650173127223e-08, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 793.48, 'epoch': 2.71}
	90%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ \| 73/81 [30:17<03:15, 24.48s/it] 91%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 74/81 [30:40<02:48, 24.07s/it] {'loss': 2.3341, 'grad_norm': 2.035534381866455, 'learning_rate': 5.017986827221732e-08, 'memory/max_active (GiB)': 58.54, 'memory/max_allocated (GiB)': 58.54, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 766.33, 'epoch': 2.74}
	91%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ \| 74/81 [30:40<02:48, 24.07s/it] 93%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 75/81 [31:04<02:24, 24.10s/it] {'loss': 1.6347, 'grad_norm': 1.792739748954773, 'learning_rate': 3.849509567257958e-08, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 786.7, 'epoch': 2.78}
	93%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ \| 75/81 [31:04<02:24, 24.10s/it] 94%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 76/81 [31:32<02:06, 25.35s/it] {'loss': 1.4495, 'grad_norm': 1.2769355773925781, 'learning_rate': 2.8330659959589942e-08, 'memory/max_active (GiB)': 86.1, 'memory/max_allocated (GiB)': 86.1, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1065.84, 'epoch': 2.82}
	94%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ \| 76/81 [31:32<02:06, 25.35s/it] 95%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 77/81 [31:56<01:39, 24.82s/it] {'loss': 2.2957, 'grad_norm': 1.664533257484436, 'learning_rate': 1.9702633195363917e-08, 'memory/max_active (GiB)': 60.57, 'memory/max_allocated (GiB)': 60.57, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 865.29, 'epoch': 2.85}
	95%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ \| 77/81 [31:56<01:39, 24.82s/it] 96%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 78/81 [32:24<01:17, 25.73s/it] {'loss': 1.8847, 'grad_norm': 1.805460810661316, 'learning_rate': 1.2624658063666638e-08, 'memory/max_active (GiB)': 84.93, 'memory/max_allocated (GiB)': 84.93, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 789.11, 'epoch': 2.89}
	96%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ \| 78/81 [32:24<01:17, 25.73s/it] 98%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 79/81 [32:49<00:51, 25.71s/it] {'loss': 0.6906, 'grad_norm': 0.49316051602363586, 'learning_rate': 7.10792629802659e-09, 'memory/max_active (GiB)': 79.04, 'memory/max_allocated (GiB)': 79.04, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1009.91, 'epoch': 2.93}
	98%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ \| 79/81 [32:49<00:51, 25.71s/it] 99%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 80/81 [33:16<00:26, 26.12s/it] {'loss': 1.9236, 'grad_norm': 1.247730016708374, 'learning_rate': 3.1611609853041676e-09, 'memory/max_active (GiB)': 81.45, 'memory/max_allocated (GiB)': 81.45, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.97, 'epoch': 2.97}
	99%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ \| 80/81 [33:16<00:26, 26.12s/it] 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 81/81 [33:40<00:00, 25.25s/it] {'loss': 0.9602, 'grad_norm': 0.6849121451377869, 'learning_rate': 7.906027726981567e-10, 'memory/max_active (GiB)': 76.78, 'memory/max_allocated (GiB)': 76.78, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 870.01, 'epoch': 3.0}
	100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 81/81 [33:40<00:00, 25.25s/it][2025-12-27 21:52:44,059] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-81
	{'train_runtime': 2021.9142, 'train_samples_per_second': 5.128, 'train_steps_per_second': 0.04, 'train_loss': 1.5273678048893258, 'memory/max_active (GiB)': 4.6, 'memory/max_allocated (GiB)': 4.6, 'memory/device_reserved (GiB)': 90.68, 'epoch': 3.0}
	100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 81/81 [33:41<00:00, 25.25s/it] 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 81/81 [33:41<00:00, 24.96s/it]
	[2025-12-27 21:52:46,295] [INFO] [axolotl.train.save_trained_model:218] [PID:122677] Training completed! Saving trained model to ./outputs/mymodel.
	[2025-12-27 21:52:47,133] [INFO] [axolotl.train.save_trained_model:336] [PID:122677] Model successfully saved to ./outputs/mymodel