File size: 139,835 Bytes

[2025-12-27 21:18:07,941] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:122677] bf16 support detected, enabling for this configuration.

config.json: 0.00B [00:00, ?B/s]
config.json: 1.54kB [00:00, 6.02MB/s]
[2025-12-27 21:18:08,103] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:122677] baseline 0.000GB ()
[2025-12-27 21:18:08,106] [INFO] [axolotl.cli.config.load_cfg:248] [PID:122677] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "config.yaml",
  "base_model": "BKM1804/affine-he-CIVICbeatPORSCHE",
  "base_model_config": "BKM1804/affine-he-CIVICbeatPORSCHE",
  "batch_size": 128,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_processes": 18,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "messages",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "/workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl",
      "split": "train",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "gradient_accumulation_steps": 64,
  "gradient_checkpointing": false,
  "include_tkps": true,
  "learning_rate": 2e-06,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": true,
  "local_rank": 0,
  "lora_alpha": 64,
  "lora_dropout": 0.05,
  "lora_r": 32,
  "lora_target_modules": [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "qwen3",
  "num_epochs": 3.0,
  "optimizer": "adamw_bnb_8bit",
  "output_dir": "./outputs/mymodel",
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "BKM1804/affine-he-CIVICbeatPORSCHE",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "weight_decay": 0.0,
  "world_size": 1
}

tokenizer_config.json: 0.00B [00:00, ?B/s]
tokenizer_config.json: 5.40kB [00:00, 22.8MB/s]

vocab.json: 0.00B [00:00, ?B/s]
vocab.json: 32.8kB [00:00, 297kB/s]
vocab.json: 1.66MB [00:00, 7.30MB/s]
vocab.json: 2.78MB [00:00, 10.2MB/s]

merges.txt: 0.00B [00:00, ?B/s]
merges.txt: 43.4kB [00:00, 357kB/s]
merges.txt: 1.67MB [00:00, 8.66MB/s]

tokenizer.json:   0%|                                                                                                                        | 0.00/11.4M [00:00<?, ?B/s]
tokenizer.json:   3%|███▎                                                                                                             | 329k/11.4M [00:00<00:16, 682kB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4M/11.4M [00:00<00:00, 25.0MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4M/11.4M [00:00<00:00, 19.1MB/s]

added_tokens.json:   0%|                                                                                                                       | 0.00/707 [00:00<?, ?B/s]
added_tokens.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 707/707 [00:00<00:00, 5.99MB/s]

special_tokens_map.json:   0%|                                                                                                                 | 0.00/613 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 613/613 [00:00<00:00, 4.94MB/s]

chat_template.jinja: 0.00B [00:00, ?B/s]
chat_template.jinja: 4.93kB [00:00, 23.1MB/s]
[2025-12-27 21:18:10,826] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <|im_end|>
[2025-12-27 21:18:10,827] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None
[2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <|endoftext|>
[2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None
[2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:122677] Unable to find prepared dataset in last_run_prepared/f6b60198703671e2d2150636511428c1
[2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:122677] Loading raw datasets...
[2025-12-27 21:18:10,829] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:122677] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2025-12-27 21:18:10,933] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:122677] Loading dataset: /workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl with base_type: chat_template and prompt_style: None
[2025-12-27 21:18:10,935] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:122677] Using chat template:
---
{%- set ns = namespace(last_query_index=-1) %}
{%- for message in messages %}
    {%- if message.role == "user" %}
        {%- set ns.last_query_index = loop.index0 %}
    {%- endif %}
{%- endfor %}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if message.content is string %}
        {%- set content = message.content %}
    {%- else %}
        {%- set content = '' %}
    {%- endif %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set has_loss = (message.loss is defined and message.loss) %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {{- '<|im_start|>' + message.role + '\n' }}
        {%- if has_loss -%}
        {%- generation -%}
        {%- if loop.index0 > ns.last_query_index and reasoning_content %}
            {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
        {%- else %}
            {{- content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>' }}
        {%- endgeneration -%}
        {{- '\n' }}
        {%- else -%}
            {%- if loop.index0 > ns.last_query_index and reasoning_content %}
                {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- content }}
            {%- endif %}
            {%- if message.tool_calls %}
                {%- for tool_call in message.tool_calls %}
                    {%- if (loop.first and content) or (not loop.first) %}
                        {{- '\n' }}
                    {%- endif %}
                    {%- if tool_call.function %}
                        {%- set tool_call = tool_call.function %}
                    {%- endif %}
                    {{- '<tool_call>\n{"name": "' }}
                    {{- tool_call.name }}
                    {{- '", "arguments": ' }}
                    {%- if tool_call.arguments is string %}
                        {{- tool_call.arguments }}
                    {%- else %}
                        {{- tool_call.arguments | tojson }}
                    {%- endif %}
                    {{- '}\n</tool_call>' }}
                {%- endfor %}
            {%- endif %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}
---

Tokenizing Prompts (num_proc=18):   0%|                                                                                                  | 0/3494 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=18):   6%|████▊                                                                                  | 195/3494 [00:01<00:18, 179.32 examples/s]
Tokenizing Prompts (num_proc=18):  11%|█████████▋                                                                             | 390/3494 [00:01<00:08, 380.73 examples/s]
Tokenizing Prompts (num_proc=18):  17%|██████████████▌                                                                        | 584/3494 [00:01<00:04, 583.45 examples/s]
Tokenizing Prompts (num_proc=18):  28%|███████████████████████▉                                                              | 972/3494 [00:01<00:02, 1112.10 examples/s]
Tokenizing Prompts (num_proc=18):  39%|█████████████████████████████████                                                    | 1360/3494 [00:01<00:01, 1564.38 examples/s]
Tokenizing Prompts (num_proc=18):  50%|██████████████████████████████████████████▌                                          | 1748/3494 [00:01<00:00, 2022.08 examples/s]
Tokenizing Prompts (num_proc=18):  61%|███████████████████████████████████████████████████▉                                 | 2136/3494 [00:01<00:00, 1942.13 examples/s]
Tokenizing Prompts (num_proc=18):  89%|███████████████████████████████████████████████████████████████████████████▌         | 3106/3494 [00:02<00:00, 3193.48 examples/s]
Tokenizing Prompts (num_proc=18): 100%|█████████████████████████████████████████████████████████████████████████████████████| 3494/3494 [00:02<00:00, 2646.33 examples/s]
Tokenizing Prompts (num_proc=18): 100%|█████████████████████████████████████████████████████████████████████████████████████| 3494/3494 [00:02<00:00, 1446.93 examples/s]
[2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:122677] min_input_len: 64
[2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:122677] max_input_len: 24840

Dropping Long Sequences (>4096) (num_proc=18):   0%|                                                                                     | 0/3494 [00:00<?, ? examples/s]
Dropping Long Sequences (>4096) (num_proc=18):   6%|████▏                                                                     | 195/3494 [00:00<00:05, 564.88 examples/s]
Dropping Long Sequences (>4096) (num_proc=18): 100%|████████████████████████████████████████████████████████████████████████| 3494/3494 [00:00<00:00, 6342.01 examples/s]
[2025-12-27 21:18:14,100] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:122677] Dropped 49 samples from dataset

Saving the dataset (0/13 shards):   0%|                                                                                                  | 0/3445 [00:00<?, ? examples/s]
Saving the dataset (0/13 shards):   8%|██████▋                                                                                | 265/3445 [00:00<00:03, 901.51 examples/s]
Saving the dataset (1/13 shards):   8%|██████▋                                                                                | 265/3445 [00:00<00:03, 901.51 examples/s]
Saving the dataset (2/13 shards):  15%|█████████████▍                                                                         | 530/3445 [00:00<00:03, 901.51 examples/s]
Saving the dataset (3/13 shards):  23%|████████████████████                                                                   | 795/3445 [00:00<00:02, 901.51 examples/s]
Saving the dataset (4/13 shards):  31%|██████████████████████████▍                                                           | 1060/3445 [00:00<00:02, 901.51 examples/s]
Saving the dataset (5/13 shards):  38%|█████████████████████████████████                                                     | 1325/3445 [00:00<00:02, 901.51 examples/s]
Saving the dataset (6/13 shards):  46%|███████████████████████████████████████▋                                              | 1590/3445 [00:00<00:02, 901.51 examples/s]
Saving the dataset (7/13 shards):  54%|██████████████████████████████████████████████▎                                       | 1855/3445 [00:00<00:01, 901.51 examples/s]
Saving the dataset (8/13 shards):  62%|████████████████████████████████████████████████████▉                                 | 2120/3445 [00:00<00:01, 901.51 examples/s]
Saving the dataset (9/13 shards):  69%|███████████████████████████████████████████████████████████▌                          | 2385/3445 [00:00<00:01, 901.51 examples/s]
Saving the dataset (10/13 shards):  77%|█████████████████████████████████████████████████████████████████▍                   | 2650/3445 [00:00<00:00, 901.51 examples/s]
Saving the dataset (11/13 shards):  85%|███████████████████████████████████████████████████████████████████████▉             | 2915/3445 [00:00<00:00, 901.51 examples/s]
Saving the dataset (12/13 shards):  92%|██████████████████████████████████████████████████████████████████████████████▍      | 3180/3445 [00:00<00:00, 901.51 examples/s]
Saving the dataset (13/13 shards): 100%|█████████████████████████████████████████████████████████████████████████████████████| 3445/3445 [00:00<00:00, 901.51 examples/s]
Saving the dataset (13/13 shards): 100%|████████████████████████████████████████████████████████████████████████████████████| 3445/3445 [00:00<00:00, 8496.02 examples/s]
[2025-12-27 21:18:14,663] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:122677] total_num_tokens: 1_863_059
[2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:122677] `total_supervised_tokens: 888_884`
[2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:122677] total_num_steps: 81
[2025-12-27 21:18:14,696] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:122677] Maximum number of steps set at 81
[2025-12-27 21:18:14,722] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:122677] Loading tokenizer... BKM1804/affine-he-CIVICbeatPORSCHE
[2025-12-27 21:18:15,206] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <|im_end|>
[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None
[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <|endoftext|>
[2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None
[2025-12-27 21:18:15,208] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:122677] Loading model
[2025-12-27 21:18:15,257] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:122677] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-12-27 21:18:15,258] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:122677] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation

model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 32.9kB [00:00, 88.3MB/s]

model-00001-of-00002.safetensors:   0%|                                                                                                      | 0.00/4.97G [00:00<?, ?B/s]
model-00001-of-00002.safetensors:   0%|                                                                                             | 680k/4.97G [00:00<2:00:53, 685kB/s]
model-00001-of-00002.safetensors:   0%|                                                                                             | 2.84M/4.97G [00:01<39:57, 2.07MB/s]
model-00001-of-00002.safetensors:   1%|█▎                                                                                           | 70.5M/4.97G [00:01<01:22, 59.1MB/s]
model-00001-of-00002.safetensors:   3%|██▌                                                                                           | 138M/4.97G [00:02<00:49, 98.0MB/s]
model-00001-of-00002.safetensors:   4%|███▉                                                                                           | 205M/4.97G [00:02<00:33, 141MB/s]
model-00001-of-00002.safetensors:   5%|█████▏                                                                                         | 272M/4.97G [00:02<00:25, 185MB/s]
model-00001-of-00002.safetensors:   7%|██████▍                                                                                        | 339M/4.97G [00:02<00:20, 222MB/s]
model-00001-of-00002.safetensors:   8%|███████▊                                                                                       | 406M/4.97G [00:03<00:19, 235MB/s]
model-00001-of-00002.safetensors:  10%|█████████                                                                                      | 473M/4.97G [00:03<00:20, 225MB/s]
model-00001-of-00002.safetensors:  11%|██████████▎                                                                                    | 540M/4.97G [00:03<00:20, 215MB/s]
model-00001-of-00002.safetensors:  12%|███████████▌                                                                                   | 607M/4.97G [00:04<00:20, 210MB/s]
model-00001-of-00002.safetensors:  14%|████████████▉                                                                                  | 674M/4.97G [00:04<00:20, 209MB/s]
model-00001-of-00002.safetensors:  15%|██████████████▏                                                                                | 741M/4.97G [00:04<00:17, 237MB/s]
model-00001-of-00002.safetensors:  16%|███████████████▍                                                                               | 808M/4.97G [00:04<00:15, 262MB/s]
model-00001-of-00002.safetensors:  18%|████████████████▋                                                                              | 875M/4.97G [00:04<00:14, 286MB/s]
model-00001-of-00002.safetensors:  19%|██████████████████                                                                             | 942M/4.97G [00:05<00:13, 294MB/s]
model-00001-of-00002.safetensors:  20%|███████████████████                                                                           | 1.01G/4.97G [00:05<00:15, 260MB/s]
model-00001-of-00002.safetensors:  22%|████████████████████▎                                                                         | 1.08G/4.97G [00:05<00:16, 237MB/s]
model-00001-of-00002.safetensors:  23%|█████████████████████▋                                                                        | 1.14G/4.97G [00:06<00:15, 251MB/s]
model-00001-of-00002.safetensors:  24%|██████████████████████▉                                                                       | 1.21G/4.97G [00:06<00:14, 262MB/s]
model-00001-of-00002.safetensors:  26%|████████████████████████▏                                                                     | 1.28G/4.97G [00:06<00:12, 289MB/s]
model-00001-of-00002.safetensors:  27%|█████████████████████████▍                                                                    | 1.34G/4.97G [00:06<00:12, 283MB/s]
model-00001-of-00002.safetensors:  28%|██████████████████████████▋                                                                   | 1.41G/4.97G [00:06<00:11, 298MB/s]
model-00001-of-00002.safetensors:  30%|███████████████████████████▉                                                                  | 1.48G/4.97G [00:07<00:10, 334MB/s]
model-00001-of-00002.safetensors:  31%|█████████████████████████████▎                                                                | 1.55G/4.97G [00:07<00:10, 318MB/s]
model-00001-of-00002.safetensors:  32%|██████████████████████████████▌                                                               | 1.61G/4.97G [00:07<00:12, 265MB/s]
model-00001-of-00002.safetensors:  34%|███████████████████████████████▊                                                              | 1.68G/4.97G [00:07<00:11, 287MB/s]
model-00001-of-00002.safetensors:  35%|█████████████████████████████████                                                             | 1.75G/4.97G [00:08<00:11, 291MB/s]
model-00001-of-00002.safetensors:  36%|██████████████████████████████████▎                                                           | 1.81G/4.97G [00:08<00:10, 290MB/s]
model-00001-of-00002.safetensors:  38%|███████████████████████████████████▌                                                          | 1.88G/4.97G [00:08<00:10, 301MB/s]
model-00001-of-00002.safetensors:  39%|████████████████████████████████████▊                                                         | 1.95G/4.97G [00:08<00:09, 305MB/s]
model-00001-of-00002.safetensors:  41%|██████████████████████████████████████                                                        | 2.01G/4.97G [00:08<00:08, 333MB/s]
model-00001-of-00002.safetensors:  42%|███████████████████████████████████████▍                                                      | 2.08G/4.97G [00:09<00:08, 346MB/s]
model-00001-of-00002.safetensors:  43%|████████████████████████████████████████▋                                                     | 2.15G/4.97G [00:09<00:07, 353MB/s]
model-00001-of-00002.safetensors:  45%|█████████████████████████████████████████▉                                                    | 2.22G/4.97G [00:09<00:07, 356MB/s]
model-00001-of-00002.safetensors:  46%|███████████████████████████████████████████▏                                                  | 2.28G/4.97G [00:09<00:07, 343MB/s]
model-00001-of-00002.safetensors:  47%|████████████████████████████████████████████▍                                                 | 2.35G/4.97G [00:09<00:07, 340MB/s]
model-00001-of-00002.safetensors:  49%|█████████████████████████████████████████████▋                                                | 2.42G/4.97G [00:10<00:07, 320MB/s]
model-00001-of-00002.safetensors:  50%|██████████████████████████████████████████████▉                                               | 2.48G/4.97G [00:10<00:07, 336MB/s]
model-00001-of-00002.safetensors:  51%|████████████████████████████████████████████████▎                                             | 2.55G/4.97G [00:10<00:07, 335MB/s]
model-00001-of-00002.safetensors:  53%|█████████████████████████████████████████████████▌                                            | 2.62G/4.97G [00:10<00:07, 295MB/s]
model-00001-of-00002.safetensors:  54%|██████████████████████████████████████████████████▊                                           | 2.68G/4.97G [00:10<00:07, 310MB/s]
model-00001-of-00002.safetensors:  55%|████████████████████████████████████████████████████                                          | 2.75G/4.97G [00:11<00:07, 309MB/s]
model-00001-of-00002.safetensors:  57%|█████████████████████████████████████████████████████▎                                        | 2.82G/4.97G [00:11<00:06, 308MB/s]
model-00001-of-00002.safetensors:  58%|██████████████████████████████████████████████████████▌                                       | 2.89G/4.97G [00:11<00:06, 330MB/s]
model-00001-of-00002.safetensors:  59%|███████████████████████████████████████████████████████▉                                      | 2.95G/4.97G [00:11<00:05, 344MB/s]
model-00001-of-00002.safetensors:  61%|█████████████████████████████████████████████████████████▏                                    | 3.02G/4.97G [00:11<00:05, 355MB/s]
model-00001-of-00002.safetensors:  62%|██████████████████████████████████████████████████████████▍                                   | 3.09G/4.97G [00:12<00:05, 360MB/s]
model-00001-of-00002.safetensors:  64%|███████████████████████████████████████████████████████████▋                                  | 3.15G/4.97G [00:12<00:06, 282MB/s]
model-00001-of-00002.safetensors:  65%|████████████████████████████████████████████████████████████▉                                 | 3.22G/4.97G [00:12<00:05, 312MB/s]
model-00001-of-00002.safetensors:  66%|██████████████████████████████████████████████████████████████▏                               | 3.29G/4.97G [00:12<00:05, 334MB/s]
model-00001-of-00002.safetensors:  68%|███████████████████████████████████████████████████████████████▌                              | 3.36G/4.97G [00:13<00:06, 253MB/s]
model-00001-of-00002.safetensors:  69%|████████████████████████████████████████████████████████████████▊                             | 3.43G/4.97G [00:13<00:05, 259MB/s]
model-00001-of-00002.safetensors:  70%|██████████████████████████████████████████████████████████████████                            | 3.49G/4.97G [00:13<00:05, 264MB/s]
model-00001-of-00002.safetensors:  72%|███████████████████████████████████████████████████████████████████▍                          | 3.56G/4.97G [00:13<00:05, 274MB/s]
model-00001-of-00002.safetensors:  73%|████████████████████████████████████████████████████████████████████▋                         | 3.63G/4.97G [00:14<00:05, 260MB/s]
model-00001-of-00002.safetensors:  74%|█████████████████████████████████████████████████████████████████████▉                        | 3.70G/4.97G [00:14<00:04, 256MB/s]
model-00001-of-00002.safetensors:  76%|███████████████████████████████████████████████████████████████████████▎                      | 3.77G/4.97G [00:14<00:04, 274MB/s]
model-00001-of-00002.safetensors:  77%|████████████████████████████████████████████████████████████████████████▍                     | 3.83G/4.97G [00:14<00:03, 296MB/s]
model-00001-of-00002.safetensors:  78%|█████████████████████████████████████████████████████████████████████████▊                    | 3.90G/4.97G [00:15<00:03, 317MB/s]
model-00001-of-00002.safetensors:  80%|███████████████████████████████████████████████████████████████████████████                   | 3.96G/4.97G [00:15<00:03, 319MB/s]
model-00001-of-00002.safetensors:  81%|████████████████████████████████████████████████████████████████████████████▎                 | 4.03G/4.97G [00:15<00:02, 324MB/s]
model-00001-of-00002.safetensors:  83%|█████████████████████████████████████████████████████████████████████████████▌                | 4.10G/4.97G [00:15<00:02, 335MB/s]
model-00001-of-00002.safetensors:  84%|██████████████████████████████████████████████████████████████████████████████▊               | 4.17G/4.97G [00:15<00:02, 321MB/s]
model-00001-of-00002.safetensors:  85%|████████████████████████████████████████████████████████████████████████████████              | 4.23G/4.97G [00:16<00:02, 313MB/s]
model-00001-of-00002.safetensors:  87%|█████████████████████████████████████████████████████████████████████████████████▎            | 4.30G/4.97G [00:16<00:02, 311MB/s]
model-00001-of-00002.safetensors:  88%|██████████████████████████████████████████████████████████████████████████████████▋           | 4.37G/4.97G [00:16<00:02, 299MB/s]
model-00001-of-00002.safetensors:  89%|███████████████████████████████████████████████████████████████████████████████████▉          | 4.43G/4.97G [00:16<00:01, 322MB/s]
model-00001-of-00002.safetensors:  91%|█████████████████████████████████████████████████████████████████████████████████████▏        | 4.50G/4.97G [00:16<00:01, 294MB/s]
model-00001-of-00002.safetensors:  92%|██████████████████████████████████████████████████████████████████████████████████████▍       | 4.57G/4.97G [00:17<00:01, 313MB/s]
model-00001-of-00002.safetensors:  93%|███████████████████████████████████████████████████████████████████████████████████████▋      | 4.63G/4.97G [00:17<00:01, 311MB/s]
model-00001-of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████████████████████████▉     | 4.70G/4.97G [00:17<00:00, 320MB/s]
model-00001-of-00002.safetensors:  96%|██████████████████████████████████████████████████████████████████████████████████████████▏   | 4.77G/4.97G [00:17<00:00, 328MB/s]
model-00001-of-00002.safetensors:  97%|███████████████████████████████████████████████████████████████████████████████████████████▍  | 4.83G/4.97G [00:17<00:00, 326MB/s]
model-00001-of-00002.safetensors:  99%|████████████████████████████████████████████████████████████████████████████████████████████▋ | 4.90G/4.97G [00:18<00:00, 346MB/s]
model-00001-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [00:18<00:00, 290MB/s]
model-00001-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [00:18<00:00, 269MB/s]

model-00002-of-00002.safetensors:   0%|                                                                                                      | 0.00/3.08G [00:00<?, ?B/s]
model-00002-of-00002.safetensors:   0%|                                                                                             | 547k/3.08G [00:00<1:11:35, 716kB/s]
model-00002-of-00002.safetensors:   0%|                                                                                             | 1.87M/3.08G [00:01<25:07, 2.04MB/s]
model-00002-of-00002.safetensors:   2%|██                                                                                           | 68.4M/3.08G [00:01<00:48, 62.3MB/s]
model-00002-of-00002.safetensors:   4%|████▏                                                                                          | 135M/3.08G [00:01<00:28, 102MB/s]
model-00002-of-00002.safetensors:   7%|██████▎                                                                                        | 203M/3.08G [00:02<00:20, 142MB/s]
model-00002-of-00002.safetensors:   9%|████████▎                                                                                      | 270M/3.08G [00:02<00:17, 162MB/s]
model-00002-of-00002.safetensors:  11%|██████████▍                                                                                    | 337M/3.08G [00:02<00:16, 165MB/s]
model-00002-of-00002.safetensors:  13%|████████████▍                                                                                  | 404M/3.08G [00:03<00:13, 200MB/s]
model-00002-of-00002.safetensors:  15%|██████████████▌                                                                                | 471M/3.08G [00:03<00:15, 167MB/s]
model-00002-of-00002.safetensors:  17%|████████████████▌                                                                              | 538M/3.08G [00:04<00:15, 168MB/s]
model-00002-of-00002.safetensors:  20%|██████████████████▋                                                                            | 605M/3.08G [00:04<00:16, 154MB/s]
model-00002-of-00002.safetensors:  22%|████████████████████▋                                                                          | 672M/3.08G [00:04<00:13, 182MB/s]
model-00002-of-00002.safetensors:  24%|██████████████████████▊                                                                        | 739M/3.08G [00:04<00:11, 210MB/s]
model-00002-of-00002.safetensors:  26%|████████████████████████▉                                                                      | 806M/3.08G [00:05<00:11, 198MB/s]
model-00002-of-00002.safetensors:  28%|██████████████████████████▉                                                                    | 873M/3.08G [00:05<00:10, 216MB/s]
model-00002-of-00002.safetensors:  31%|█████████████████████████████                                                                  | 940M/3.08G [00:05<00:08, 245MB/s]
model-00002-of-00002.safetensors:  33%|██████████████████████████████▊                                                               | 1.01G/3.08G [00:05<00:07, 272MB/s]
model-00002-of-00002.safetensors:  35%|████████████████████████████████▊                                                             | 1.07G/3.08G [00:06<00:07, 280MB/s]
model-00002-of-00002.safetensors:  37%|██████████████████████████████████▊                                                           | 1.14G/3.08G [00:06<00:06, 295MB/s]
model-00002-of-00002.safetensors:  39%|████████████████████████████████████▉                                                         | 1.21G/3.08G [00:06<00:05, 315MB/s]
model-00002-of-00002.safetensors:  41%|██████████████████████████████████████▉                                                       | 1.28G/3.08G [00:06<00:06, 292MB/s]
model-00002-of-00002.safetensors:  44%|████████████████████████████████████████▉                                                     | 1.34G/3.08G [00:07<00:05, 302MB/s]
model-00002-of-00002.safetensors:  46%|███████████████████████████████████████████                                                   | 1.41G/3.08G [00:07<00:05, 303MB/s]
model-00002-of-00002.safetensors:  48%|█████████████████████████████████████████████                                                 | 1.48G/3.08G [00:07<00:05, 319MB/s]
model-00002-of-00002.safetensors:  50%|███████████████████████████████████████████████                                               | 1.54G/3.08G [00:07<00:04, 318MB/s]
model-00002-of-00002.safetensors:  52%|█████████████████████████████████████████████████▏                                            | 1.61G/3.08G [00:07<00:04, 339MB/s]
model-00002-of-00002.safetensors:  54%|███████████████████████████████████████████████████▏                                          | 1.68G/3.08G [00:08<00:04, 337MB/s]
model-00002-of-00002.safetensors:  57%|█████████████████████████████████████████████████████▎                                        | 1.74G/3.08G [00:08<00:04, 315MB/s]
model-00002-of-00002.safetensors:  59%|███████████████████████████████████████████████████████▎                                      | 1.81G/3.08G [00:08<00:04, 291MB/s]
model-00002-of-00002.safetensors:  61%|█████████████████████████████████████████████████████████▎                                    | 1.88G/3.08G [00:08<00:03, 314MB/s]
model-00002-of-00002.safetensors:  63%|███████████████████████████████████████████████████████████▍                                  | 1.95G/3.08G [00:08<00:03, 320MB/s]
model-00002-of-00002.safetensors:  65%|█████████████████████████████████████████████████████████████▏                                | 2.00G/3.08G [00:09<00:03, 311MB/s]
model-00002-of-00002.safetensors:  67%|███████████████████████████████████████████████████████████████▎                              | 2.07G/3.08G [00:09<00:03, 299MB/s]
model-00002-of-00002.safetensors:  70%|█████████████████████████████████████████████████████████████████▎                            | 2.14G/3.08G [00:09<00:02, 320MB/s]
model-00002-of-00002.safetensors:  72%|███████████████████████████████████████████████████████████████████▍                          | 2.21G/3.08G [00:09<00:02, 343MB/s]
model-00002-of-00002.safetensors:  74%|█████████████████████████████████████████████████████████████████████▍                        | 2.27G/3.08G [00:09<00:02, 355MB/s]
model-00002-of-00002.safetensors:  76%|███████████████████████████████████████████████████████████████████████▍                      | 2.34G/3.08G [00:10<00:02, 364MB/s]
model-00002-of-00002.safetensors:  78%|█████████████████████████████████████████████████████████████████████████▌                    | 2.41G/3.08G [00:10<00:01, 367MB/s]
model-00002-of-00002.safetensors:  80%|███████████████████████████████████████████████████████████████████████████▌                  | 2.47G/3.08G [00:10<00:01, 383MB/s]
model-00002-of-00002.safetensors:  83%|█████████████████████████████████████████████████████████████████████████████▌                | 2.54G/3.08G [00:10<00:01, 380MB/s]
model-00002-of-00002.safetensors:  85%|███████████████████████████████████████████████████████████████████████████████▋              | 2.61G/3.08G [00:10<00:01, 304MB/s]
model-00002-of-00002.safetensors:  87%|█████████████████████████████████████████████████████████████████████████████████▋            | 2.68G/3.08G [00:11<00:01, 281MB/s]
model-00002-of-00002.safetensors:  89%|███████████████████████████████████████████████████████████████████████████████████▊          | 2.74G/3.08G [00:11<00:01, 306MB/s]
model-00002-of-00002.safetensors:  91%|█████████████████████████████████████████████████████████████████████████████████████▊        | 2.81G/3.08G [00:11<00:01, 259MB/s]
model-00002-of-00002.safetensors:  93%|███████████████████████████████████████████████████████████████████████████████████████▊      | 2.88G/3.08G [00:11<00:00, 288MB/s]
model-00002-of-00002.safetensors:  96%|█████████████████████████████████████████████████████████████████████████████████████████▉    | 2.94G/3.08G [00:12<00:00, 299MB/s]
model-00002-of-00002.safetensors:  98%|███████████████████████████████████████████████████████████████████████████████████████████▉  | 3.01G/3.08G [00:12<00:00, 251MB/s]
model-00002-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3.08G/3.08G [00:12<00:00, 281MB/s]
model-00002-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3.08G/3.08G [00:12<00:00, 244MB/s]

Loading checkpoint shards:   0%|                                                                                                                   | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████████████████████████████████████████████████████▌                                                     | 1/2 [00:06<00:06,  6.04s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.01s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.16s/it]

generation_config.json:   0%|                                                                                                                  | 0.00/188 [00:00<?, ?B/s]
generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 2.83MB/s]
[2025-12-27 21:18:58,111] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:122677] converting PEFT model w/ prepare_model_for_kbit_training
[2025-12-27 21:18:58,113] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:122677] Converting modules to torch.bfloat16
[2025-12-27 21:18:58,117] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] Memory usage after model load 5.665GB (+5.665GB allocated, +5.826GB reserved)
trainable params: 66,060,288 || all params: 4,088,528,384 || trainable%: 1.6157
[2025-12-27 21:18:58,545] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] after adapters 4.411GB (+4.411GB allocated, +5.947GB reserved)
[2025-12-27 21:19:03,382] [INFO] [axolotl.train.save_initial_configs:398] [PID:122677] Pre-saving adapter config to ./outputs/mymodel...
[2025-12-27 21:19:03,388] [INFO] [axolotl.train.save_initial_configs:402] [PID:122677] Pre-saving tokenizer to ./outputs/mymodel...
[2025-12-27 21:19:03,587] [INFO] [axolotl.train.save_initial_configs:407] [PID:122677] Pre-saving model config to ./outputs/mymodel...
[2025-12-27 21:19:03,594] [INFO] [axolotl.train.execute_training:196] [PID:122677] Starting trainer...

  0%|                                                                                                                                             | 0/81 [00:00<?, ?it/s][2025-12-27 21:19:05,215] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")


  1%|█▋                                                                                                                                   | 1/81 [00:26<34:57, 26.22s/it]
                                                                                                                                                                         
{'loss': 0.7532, 'grad_norm': 0.6496106386184692, 'learning_rate': 0.0, 'memory/max_active (GiB)': 76.91, 'memory/max_allocated (GiB)': 76.91, 'memory/device_reserved (GiB)': 79.86, 'tokens_per_second_per_gpu': 877.99, 'epoch': 0.04}

  1%|█▋                                                                                                                                   | 1/81 [00:26<34:57, 26.22s/it]
  2%|███▎                                                                                                                                 | 2/81 [00:48<31:41, 24.06s/it]
                                                                                                                                                                         
{'loss': 0.6528, 'grad_norm': 0.46730196475982666, 'learning_rate': 1e-06, 'memory/max_active (GiB)': 46.79, 'memory/max_allocated (GiB)': 46.79, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 754.12, 'epoch': 0.07}

  2%|███▎                                                                                                                                 | 2/81 [00:48<31:41, 24.06s/it]
  4%|████▉                                                                                                                                | 3/81 [01:14<32:03, 24.66s/it]
                                                                                                                                                                         
{'loss': 2.2564, 'grad_norm': 0.6027721762657166, 'learning_rate': 2e-06, 'memory/max_active (GiB)': 57.88, 'memory/max_allocated (GiB)': 57.88, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.03, 'epoch': 0.11}

  4%|████▉                                                                                                                                | 3/81 [01:14<32:03, 24.66s/it]
  5%|██████▌                                                                                                                              | 4/81 [01:38<31:33, 24.59s/it]
                                                                                                                                                                         
{'loss': 2.3578, 'grad_norm': 1.7880005836486816, 'learning_rate': 1.9992093972273017e-06, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 879.33, 'epoch': 0.15}

  5%|██████▌                                                                                                                              | 4/81 [01:38<31:33, 24.59s/it]
  6%|████████▏                                                                                                                            | 5/81 [02:02<30:38, 24.18s/it]
                                                                                                                                                                         
{'loss': 0.9175, 'grad_norm': 0.5934199690818787, 'learning_rate': 1.9968388390146957e-06, 'memory/max_active (GiB)': 57.81, 'memory/max_allocated (GiB)': 57.81, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 759.41, 'epoch': 0.19}

  6%|████████▏                                                                                                                            | 5/81 [02:02<30:38, 24.18s/it]
  7%|█████████▊                                                                                                                           | 6/81 [02:25<29:58, 23.97s/it]
                                                                                                                                                                         
{'loss': 1.6229, 'grad_norm': 1.5621187686920166, 'learning_rate': 1.992892073701973e-06, 'memory/max_active (GiB)': 68.44, 'memory/max_allocated (GiB)': 68.44, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 949.82, 'epoch': 0.22}

  7%|█████████▊                                                                                                                           | 6/81 [02:25<29:58, 23.97s/it]
  9%|███████████▍                                                                                                                         | 7/81 [02:51<30:10, 24.47s/it]
                                                                                                                                                                         
{'loss': 2.9604, 'grad_norm': 1.206151008605957, 'learning_rate': 1.987375341936333e-06, 'memory/max_active (GiB)': 60.61, 'memory/max_allocated (GiB)': 60.61, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 862.01, 'epoch': 0.26}

  9%|███████████▍                                                                                                                         | 7/81 [02:51<30:10, 24.47s/it]
 10%|█████████████▏                                                                                                                       | 8/81 [03:15<29:40, 24.39s/it]
                                                                                                                                                                         
{'loss': 0.8612, 'grad_norm': 0.6299921870231628, 'learning_rate': 1.9802973668046363e-06, 'memory/max_active (GiB)': 44.43, 'memory/max_allocated (GiB)': 44.43, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.47, 'epoch': 0.3}

 10%|█████████████▏                                                                                                                       | 8/81 [03:15<29:40, 24.39s/it]
 11%|██████████████▊                                                                                                                      | 9/81 [03:41<30:02, 25.03s/it]
                                                                                                                                                                         
{'loss': 1.2738, 'grad_norm': 1.1461963653564453, 'learning_rate': 1.9716693400404097e-06, 'memory/max_active (GiB)': 88.86, 'memory/max_allocated (GiB)': 88.86, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.2, 'epoch': 0.33}

 11%|██████████████▊                                                                                                                      | 9/81 [03:41<30:02, 25.03s/it]
 12%|████████████████▎                                                                                                                   | 10/81 [04:06<29:20, 24.80s/it]
                                                                                                                                                                         
{'loss': 0.9472, 'grad_norm': 0.4421791732311249, 'learning_rate': 1.9615049043274204e-06, 'memory/max_active (GiB)': 84.65, 'memory/max_allocated (GiB)': 84.65, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 819.72, 'epoch': 0.37}

 12%|████████████████▎                                                                                                                   | 10/81 [04:06<29:20, 24.80s/it]
 14%|█████████████████▉                                                                                                                  | 11/81 [04:32<29:37, 25.40s/it]
                                                                                                                                                                         
{'loss': 1.3682, 'grad_norm': 1.287254810333252, 'learning_rate': 1.949820131727783e-06, 'memory/max_active (GiB)': 86.21, 'memory/max_allocated (GiB)': 86.21, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1004.54, 'epoch': 0.41}

 14%|█████████████████▉                                                                                                                  | 11/81 [04:32<29:37, 25.40s/it]
 15%|███████████████████▌                                                                                                                | 12/81 [04:55<28:20, 24.65s/it]
                                                                                                                                                                         
{'loss': 1.3094, 'grad_norm': 0.8032840490341187, 'learning_rate': 1.936633498268728e-06, 'memory/max_active (GiB)': 60.39, 'memory/max_allocated (GiB)': 60.39, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.54, 'epoch': 0.45}

 15%|███████████████████▌                                                                                                                | 12/81 [04:55<28:20, 24.65s/it]
 16%|█████████████████████▏                                                                                                              | 13/81 [05:19<27:30, 24.27s/it]
                                                                                                                                                                         
{'loss': 1.7079, 'grad_norm': 0.835660457611084, 'learning_rate': 1.9219658547282065e-06, 'memory/max_active (GiB)': 58.53, 'memory/max_allocated (GiB)': 58.53, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 804.11, 'epoch': 0.48}

 16%|█████████████████████▏                                                                                                              | 13/81 [05:19<27:30, 24.27s/it]
 17%|██████████████████████▊                                                                                                             | 14/81 [05:43<27:02, 24.21s/it]
                                                                                                                                                                         
{'loss': 1.4821, 'grad_norm': 0.7599063515663147, 'learning_rate': 1.9058403936655232e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 771.26, 'epoch': 0.52}

 17%|██████████████████████▊                                                                                                             | 14/81 [05:43<27:02, 24.21s/it]
 19%|████████████████████████▍                                                                                                           | 15/81 [06:09<27:17, 24.81s/it]
                                                                                                                                                                         
{'loss': 1.1628, 'grad_norm': 0.43679726123809814, 'learning_rate': 1.8882826127491318e-06, 'memory/max_active (GiB)': 79.08, 'memory/max_allocated (GiB)': 79.08, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 897.62, 'epoch': 0.56}

 19%|████████████████████████▍                                                                                                           | 15/81 [06:09<27:17, 24.81s/it]
 20%|██████████████████████████                                                                                                          | 16/81 [06:34<26:55, 24.85s/it]
                                                                                                                                                                         
{'loss': 2.6527, 'grad_norm': 0.6629171967506409, 'learning_rate': 1.8693202744395827e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 881.06, 'epoch': 0.59}

 20%|██████████████████████████                                                                                                          | 16/81 [06:34<26:55, 24.85s/it]
 21%|███████████████████████████▋                                                                                                        | 17/81 [06:59<26:31, 24.86s/it]
                                                                                                                                                                         
{'loss': 1.5066, 'grad_norm': 0.6648272275924683, 'learning_rate': 1.848983362091364e-06, 'memory/max_active (GiB)': 85.82, 'memory/max_allocated (GiB)': 85.82, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 0.63}

 21%|███████████████████████████▋                                                                                                        | 17/81 [06:59<26:31, 24.86s/it]
 22%|█████████████████████████████▎                                                                                                      | 18/81 [07:24<26:10, 24.93s/it]
                                                                                                                                                                         
{'loss': 2.1366, 'grad_norm': 1.9141907691955566, 'learning_rate': 1.8273040325430573e-06, 'memory/max_active (GiB)': 58.11, 'memory/max_allocated (GiB)': 58.11, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 996.41, 'epoch': 0.67}

 22%|█████████████████████████████▎                                                                                                      | 18/81 [07:24<26:10, 24.93s/it]
 23%|██████████████████████████████▉                                                                                                     | 19/81 [07:49<25:42, 24.88s/it]
                                                                                                                                                                         
{'loss': 1.7062, 'grad_norm': 1.0429956912994385, 'learning_rate': 1.8043165652707648e-06, 'memory/max_active (GiB)': 69.3, 'memory/max_allocated (GiB)': 69.3, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 839.76, 'epoch': 0.71}

 23%|██████████████████████████████▉                                                                                                     | 19/81 [07:49<25:42, 24.88s/it]
 25%|████████████████████████████████▌                                                                                                   | 20/81 [08:16<26:07, 25.69s/it]
                                                                                                                                                                         
{'loss': 1.7071, 'grad_norm': 0.8590395450592041, 'learning_rate': 1.780057308185212e-06, 'memory/max_active (GiB)': 89.48, 'memory/max_allocated (GiB)': 89.48, 'memory/device_reserved (GiB)': 92.15, 'tokens_per_second_per_gpu': 951.25, 'epoch': 0.74}

 25%|████████████████████████████████▌                                                                                                   | 20/81 [08:16<26:07, 25.69s/it]
 26%|██████████████████████████████████▏                                                                                                 | 21/81 [08:42<25:35, 25.60s/it]
                                                                                                                                                                         
{'loss': 1.4588, 'grad_norm': 0.7291064858436584, 'learning_rate': 1.75456462015823e-06, 'memory/max_active (GiB)': 89.39, 'memory/max_allocated (GiB)': 89.39, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 828.05, 'epoch': 0.78}

 26%|██████████████████████████████████▏                                                                                                 | 21/81 [08:42<25:35, 25.60s/it]
 27%|███████████████████████████████████▊                                                                                                | 22/81 [09:08<25:26, 25.87s/it]
                                                                                                                                                                         
{'loss': 1.8362, 'grad_norm': 1.2527186870574951, 'learning_rate': 1.7278788103694942e-06, 'memory/max_active (GiB)': 68.62, 'memory/max_allocated (GiB)': 68.62, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 817.67, 'epoch': 0.82}

 27%|███████████████████████████████████▊                                                                                                | 22/81 [09:08<25:26, 25.87s/it]
 28%|█████████████████████████████████████▍                                                                                              | 23/81 [09:34<24:59, 25.86s/it]
                                                                                                                                                                         
{'loss': 1.5874, 'grad_norm': 1.3045358657836914, 'learning_rate': 1.7000420745694253e-06, 'memory/max_active (GiB)': 78.8, 'memory/max_allocated (GiB)': 78.8, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 864.14, 'epoch': 0.85}

 28%|█████████████████████████████████████▍                                                                                              | 23/81 [09:34<24:59, 25.86s/it]
 30%|███████████████████████████████████████                                                                                             | 24/81 [10:00<24:31, 25.82s/it]
                                                                                                                                                                         
{'loss': 2.1574, 'grad_norm': 1.6913419961929321, 'learning_rate': 1.6710984283590367e-06, 'memory/max_active (GiB)': 81.59, 'memory/max_allocated (GiB)': 81.59, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 897.53, 'epoch': 0.89}

 30%|███████████████████████████████████████                                                                                             | 24/81 [10:00<24:31, 25.82s/it]
 31%|████████████████████████████████████████▋                                                                                           | 25/81 [10:24<23:40, 25.37s/it]
                                                                                                                                                                         
{'loss': 1.058, 'grad_norm': 0.7722698450088501, 'learning_rate': 1.64109363759222e-06, 'memory/max_active (GiB)': 61.45, 'memory/max_allocated (GiB)': 61.45, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 942.84, 'epoch': 0.93}

 31%|████████████████████████████████████████▋                                                                                           | 25/81 [10:24<23:40, 25.37s/it]
 32%|██████████████████████████████████████████▎                                                                                         | 26/81 [10:50<23:33, 25.70s/it]
                                                                                                                                                                         
{'loss': 0.9906, 'grad_norm': 0.9898315072059631, 'learning_rate': 1.6100751460105243e-06, 'memory/max_active (GiB)': 86.27, 'memory/max_allocated (GiB)': 86.27, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 910.63, 'epoch': 0.97}

 32%|██████████████████████████████████████████▎                                                                                         | 26/81 [10:50<23:33, 25.70s/it]
 33%|████████████████████████████████████████████                                                                                        | 27/81 [11:11<21:51, 24.28s/it]
                                                                                                                                                                         
{'loss': 0.716, 'grad_norm': 0.5236030220985413, 'learning_rate': 1.5780920002248483e-06, 'memory/max_active (GiB)': 45.08, 'memory/max_allocated (GiB)': 45.08, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 793.15, 'epoch': 1.0}

 33%|████████████████████████████████████████████                                                                                        | 27/81 [11:11<21:51, 24.28s/it][2025-12-27 21:30:15,957] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-27
[2025-12-27 21:30:18,782] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")


 35%|█████████████████████████████████████████████▋                                                                                      | 28/81 [11:37<21:44, 24.62s/it]
                                                                                                                                                                         
{'loss': 1.4101, 'grad_norm': 0.6720283031463623, 'learning_rate': 1.5451947721626675e-06, 'memory/max_active (GiB)': 49.49, 'memory/max_allocated (GiB)': 49.49, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 713.64, 'epoch': 1.04}

 35%|█████████████████████████████████████████████▋                                                                                      | 28/81 [11:37<21:44, 24.62s/it]
 36%|███████████████████████████████████████████████▎                                                                                    | 29/81 [12:01<21:16, 24.55s/it]
                                                                                                                                                                         
{'loss': 2.9962, 'grad_norm': 2.1970348358154297, 'learning_rate': 1.5114354791034222e-06, 'memory/max_active (GiB)': 68.78, 'memory/max_allocated (GiB)': 68.78, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 887.43, 'epoch': 1.07}

 36%|███████████████████████████████████████████████▎                                                                                    | 29/81 [12:01<21:16, 24.55s/it]
 37%|████████████████████████████████████████████████▉                                                                                   | 30/81 [12:24<20:17, 23.88s/it]
                                                                                                                                                                         
{'loss': 0.6654, 'grad_norm': 0.5169208645820618, 'learning_rate': 1.476867501428506e-06, 'memory/max_active (GiB)': 69.25, 'memory/max_allocated (GiB)': 69.25, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 837.12, 'epoch': 1.11}

 37%|████████████████████████████████████████████████▉                                                                                   | 30/81 [12:24<20:17, 23.88s/it]
 38%|██████████████████████████████████████████████████▌                                                                                 | 31/81 [12:50<20:28, 24.58s/it]
                                                                                                                                                                         
{'loss': 2.3111, 'grad_norm': 1.6675411462783813, 'learning_rate': 1.4415454982159118e-06, 'memory/max_active (GiB)': 84.87, 'memory/max_allocated (GiB)': 84.87, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 893.34, 'epoch': 1.15}

 38%|██████████████████████████████████████████████████▌                                                                                 | 31/81 [12:50<20:28, 24.58s/it]
 40%|████████████████████████████████████████████████████▏                                                                               | 32/81 [13:13<19:47, 24.24s/it]
                                                                                                                                                                         
{'loss': 1.454, 'grad_norm': 0.8987345695495605, 'learning_rate': 1.4055253208129937e-06, 'memory/max_active (GiB)': 54.96, 'memory/max_allocated (GiB)': 54.96, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 747.48, 'epoch': 1.19}

 40%|████████████████████████████████████████████████████▏                                                                               | 32/81 [13:13<19:47, 24.24s/it]
 41%|█████████████████████████████████████████████████████▊                                                                              | 33/81 [13:39<19:45, 24.71s/it]
                                                                                                                                                                         
{'loss': 2.1194, 'grad_norm': 2.0316708087921143, 'learning_rate': 1.3688639245240078e-06, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 880.1, 'epoch': 1.22}

 41%|█████████████████████████████████████████████████████▊                                                                              | 33/81 [13:39<19:45, 24.71s/it]
 42%|███████████████████████████████████████████████████████▍                                                                            | 34/81 [14:05<19:37, 25.04s/it]
                                                                                                                                                                         
{'loss': 1.3693, 'grad_norm': 0.9677668213844299, 'learning_rate': 1.3316192785520678e-06, 'memory/max_active (GiB)': 52.15, 'memory/max_allocated (GiB)': 52.15, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 903.2, 'epoch': 1.26}

 42%|███████████████████████████████████████████████████████▍                                                                            | 34/81 [14:05<19:37, 25.04s/it]
 43%|█████████████████████████████████████████████████████████                                                                           | 35/81 [14:32<19:46, 25.80s/it]
                                                                                                                                                                         
{'loss': 1.5613, 'grad_norm': 1.5035587549209595, 'learning_rate': 1.2938502743379209e-06, 'memory/max_active (GiB)': 89.55, 'memory/max_allocated (GiB)': 89.55, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 1018.31, 'epoch': 1.3}

 43%|█████████████████████████████████████████████████████████                                                                           | 35/81 [14:32<19:46, 25.80s/it]
 44%|██████████████████████████████████████████████████████████▋                                                                         | 36/81 [14:56<18:53, 25.18s/it]
                                                                                                                                                                         
{'loss': 1.6472, 'grad_norm': 1.0176411867141724, 'learning_rate': 1.2556166324404746e-06, 'memory/max_active (GiB)': 89.41, 'memory/max_allocated (GiB)': 89.41, 'memory/device_reserved (GiB)': 90.5, 'tokens_per_second_per_gpu': 935.17, 'epoch': 1.33}

 44%|██████████████████████████████████████████████████████████▋                                                                         | 36/81 [14:56<18:53, 25.18s/it]
 46%|████████████████████████████████████████████████████████████▎                                                                       | 37/81 [15:21<18:23, 25.08s/it]
                                                                                                                                                                         
{'loss': 2.048, 'grad_norm': 1.1598583459854126, 'learning_rate': 1.2169788081063178e-06, 'memory/max_active (GiB)': 76.99, 'memory/max_allocated (GiB)': 76.99, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 848.84, 'epoch': 1.37}

 46%|████████████████████████████████████████████████████████████▎                                                                       | 37/81 [15:21<18:23, 25.08s/it]
 47%|█████████████████████████████████████████████████████████████▉                                                                      | 38/81 [15:47<18:07, 25.30s/it]
                                                                                                                                                                         
{'loss': 1.4693, 'grad_norm': 1.03346848487854, 'learning_rate': 1.1779978956775504e-06, 'memory/max_active (GiB)': 81.33, 'memory/max_allocated (GiB)': 81.33, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 864.18, 'epoch': 1.41}

 47%|█████████████████████████████████████████████████████████████▉                                                                      | 38/81 [15:47<18:07, 25.30s/it]
 48%|███████████████████████████████████████████████████████████████▌                                                                    | 39/81 [16:09<17:02, 24.35s/it]
                                                                                                                                                                         
{'loss': 0.629, 'grad_norm': 0.5300745964050293, 'learning_rate': 1.1387355319890683e-06, 'memory/max_active (GiB)': 38.13, 'memory/max_allocated (GiB)': 38.13, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 660.81, 'epoch': 1.45}

 48%|███████████████████████████████████████████████████████████████▌                                                                    | 39/81 [16:09<17:02, 24.35s/it]
 49%|█████████████████████████████████████████████████████████████████▏                                                                  | 40/81 [16:35<17:04, 24.98s/it]
                                                                                                                                                                         
{'loss': 1.354, 'grad_norm': 0.45447468757629395, 'learning_rate': 1.0992537989080618e-06, 'memory/max_active (GiB)': 68.52, 'memory/max_allocated (GiB)': 68.52, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 953.46, 'epoch': 1.48}

 49%|█████████████████████████████████████████████████████████████████▏                                                                  | 40/81 [16:35<17:04, 24.98s/it]
 51%|██████████████████████████████████████████████████████████████████▊                                                                 | 41/81 [17:00<16:39, 25.00s/it]
                                                                                                                                                                         
{'loss': 0.7253, 'grad_norm': 0.3777391314506531, 'learning_rate': 1.0596151251698198e-06, 'memory/max_active (GiB)': 86.09, 'memory/max_allocated (GiB)': 86.09, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 994.9, 'epoch': 1.52}

 51%|██████████████████████████████████████████████████████████████████▊                                                                 | 41/81 [17:00<16:39, 25.00s/it]
 52%|██████████████████████████████                            | 42/81 [17:25<16:14, 25.00s/it]                                                                          
                                                                                               
{'loss': 0.5797, 'grad_norm': 0.4444688856601715, 'learning_rate': 1.01988218766507e-06, 'memory/max_active (GiB)': 67.08, 'memory/max_allocated (GiB)': 67.08, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 768.21, 'epoch': 1.56}

 52%|██████████████████████████████                            | 42/81 [17:25<16:14, 25.00s/it]
 53%|██████████████████████████████▊                           | 43/81 [17:52<16:13, 25.61s/it]
                                                                                               
{'loss': 1.7366, 'grad_norm': 0.8071985244750977, 'learning_rate': 9.801178123349297e-07, 'memory/max_active (GiB)': 84.78, 'memory/max_allocated (GiB)': 84.78, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 1054.07, 'epoch': 1.59}

 53%|██████████████████████████████▊                           | 43/81 [17:52<16:13, 25.61s/it]
 54%|███████████████████████████████▌                          | 44/81 [18:17<15:33, 25.23s/it]
                                                                                               
{'loss': 0.8716, 'grad_norm': 0.7125491499900818, 'learning_rate': 9.403848748301802e-07, 'memory/max_active (GiB)': 62.62, 'memory/max_allocated (GiB)': 62.62, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 829.41, 'epoch': 1.63}

 54%|███████████████████████████████▌                          | 44/81 [18:17<15:33, 25.23s/it]
 56%|████████████████████████████████▏                         | 45/81 [18:43<15:15, 25.42s/it]
                                                                                               
{'loss': 3.5353, 'grad_norm': 1.353916883468628, 'learning_rate': 9.007462010919385e-07, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 878.82, 'epoch': 1.67}

 56%|████████████████████████████████▏                         | 45/81 [18:43<15:15, 25.42s/it]
 57%|████████████████████████████████▉                         | 46/81 [19:06<14:29, 24.85s/it]
                                                                                               
{'loss': 1.3489, 'grad_norm': 1.1499241590499878, 'learning_rate': 8.612644680109318e-07, 'memory/max_active (GiB)': 79.0, 'memory/max_allocated (GiB)': 79.0, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 868.24, 'epoch': 1.71}

 57%|████████████████████████████████▉                         | 46/81 [19:06<14:29, 24.85s/it]
 58%|█████████████████████████████████▋                        | 47/81 [19:31<14:08, 24.96s/it]
                                                                                               
{'loss': 1.543, 'grad_norm': 1.5999720096588135, 'learning_rate': 8.220021043224499e-07, 'memory/max_active (GiB)': 61.32, 'memory/max_allocated (GiB)': 61.32, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 935.5, 'epoch': 1.74}

 58%|█████████████████████████████████▋                        | 47/81 [19:31<14:08, 24.96s/it]
 59%|██████████████████████████████████▎                       | 48/81 [19:56<13:39, 24.84s/it]
                                                                                               
{'loss': 1.2282, 'grad_norm': 0.930808961391449, 'learning_rate': 7.830211918936819e-07, 'memory/max_active (GiB)': 44.35, 'memory/max_allocated (GiB)': 44.35, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 827.33, 'epoch': 1.78}

 59%|██████████████████████████████████▎                       | 48/81 [19:56<13:39, 24.84s/it]
 60%|███████████████████████████████████                       | 49/81 [20:19<12:56, 24.27s/it]
                                                                                               
{'loss': 3.0884, 'grad_norm': 1.004947543144226, 'learning_rate': 7.443833675595253e-07, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 782.62, 'epoch': 1.82}

 60%|███████████████████████████████████                       | 49/81 [20:19<12:56, 24.27s/it]
 62%|███████████████████████████████████▊                      | 50/81 [20:44<12:37, 24.44s/it]
                                                                                               
{'loss': 1.0819, 'grad_norm': 0.8130286335945129, 'learning_rate': 7.061497256620792e-07, 'memory/max_active (GiB)': 49.75, 'memory/max_allocated (GiB)': 49.75, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 915.1, 'epoch': 1.85}

 62%|███████████████████████████████████▊                      | 50/81 [20:44<12:37, 24.44s/it]
 63%|████████████████████████████████████▌                     | 51/81 [21:11<12:40, 25.36s/it]
                                                                                               
{'loss': 1.6019, 'grad_norm': 0.8784174919128418, 'learning_rate': 6.683807214479323e-07, 'memory/max_active (GiB)': 75.17, 'memory/max_allocated (GiB)': 75.17, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 988.63, 'epoch': 1.89}

 63%|████████████████████████████████████▌                     | 51/81 [21:11<12:40, 25.36s/it]
 64%|█████████████████████████████████████▏                    | 52/81 [21:38<12:30, 25.88s/it]
                                                                                               
{'loss': 1.3716, 'grad_norm': 1.0731853246688843, 'learning_rate': 6.311360754759923e-07, 'memory/max_active (GiB)': 88.84, 'memory/max_allocated (GiB)': 88.84, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 865.92, 'epoch': 1.93}

 64%|█████████████████████████████████████▏                    | 52/81 [21:38<12:30, 25.88s/it]
 65%|█████████████████████████████████████▉                    | 53/81 [22:03<11:54, 25.53s/it]
                                                                                               
{'loss': 0.4119, 'grad_norm': 0.340857595205307, 'learning_rate': 5.944746791870061e-07, 'memory/max_active (GiB)': 86.43, 'memory/max_allocated (GiB)': 86.43, 'memory/device_reserved (GiB)': 89.76, 'tokens_per_second_per_gpu': 873.98, 'epoch': 1.97}

 65%|█████████████████████████████████████▉                    | 53/81 [22:03<11:54, 25.53s/it]
 67%|██████████████████████████████████████▋                   | 54/81 [22:25<11:04, 24.60s/it]
                                                                                               
{'loss': 0.7065, 'grad_norm': 0.48797884583473206, 'learning_rate': 5.584545017840885e-07, 'memory/max_active (GiB)': 86.08, 'memory/max_allocated (GiB)': 86.08, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 844.64, 'epoch': 2.0}

 67%|██████████████████████████████████████▋                   | 54/81 [22:25<11:04, 24.60s/it][2025-12-27 21:41:29,979] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-54
[2025-12-27 21:41:32,641] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")


 68%|███████████████████████████████████████▍                  | 55/81 [22:54<11:09, 25.76s/it]
                                                                                               
{'loss': 2.1499, 'grad_norm': 1.208370566368103, 'learning_rate': 5.231324985714941e-07, 'memory/max_active (GiB)': 81.46, 'memory/max_allocated (GiB)': 81.46, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 850.71, 'epoch': 2.04}

 68%|███████████████████████████████████████▍                  | 55/81 [22:54<11:09, 25.76s/it]
 69%|████████████████████████████████████████                  | 56/81 [23:17<10:24, 24.99s/it]
                                                                                               
{'loss': 0.8085, 'grad_norm': 0.6215103268623352, 'learning_rate': 4.885645208965778e-07, 'memory/max_active (GiB)': 45.1, 'memory/max_allocated (GiB)': 45.1, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 825.2, 'epoch': 2.07}

 69%|████████████████████████████████████████                  | 56/81 [23:17<10:24, 24.99s/it]
 70%|████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 57/81 [23:42<10:00, 25.02s/it]
                                                                                                                                                                         
{'loss': 1.8707, 'grad_norm': 1.5340229272842407, 'learning_rate': 4.5480522783733265e-07, 'memory/max_active (GiB)': 74.01, 'memory/max_allocated (GiB)': 74.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 974.6, 'epoch': 2.11}

 70%|████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 57/81 [23:42<10:00, 25.02s/it]
 72%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 58/81 [24:06<09:27, 24.66s/it]
                                                                                                                                                                         
{'loss': 1.289, 'grad_norm': 0.5400649309158325, 'learning_rate': 4.2190799977515145e-07, 'memory/max_active (GiB)': 68.43, 'memory/max_allocated (GiB)': 68.43, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 871.48, 'epoch': 2.15}

 72%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 58/81 [24:06<09:27, 24.66s/it]
 73%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 59/81 [24:32<09:11, 25.05s/it]
                                                                                                                                                                         
{'loss': 0.8458, 'grad_norm': 0.4076080322265625, 'learning_rate': 3.8992485398947563e-07, 'memory/max_active (GiB)': 62.83, 'memory/max_allocated (GiB)': 62.83, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 2.19}

 73%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 59/81 [24:32<09:11, 25.05s/it]
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 60/81 [24:56<08:39, 24.73s/it]
                                                                                                                                                                         
{'loss': 1.7388, 'grad_norm': 0.8173409700393677, 'learning_rate': 3.5890636240778015e-07, 'memory/max_active (GiB)': 51.13, 'memory/max_allocated (GiB)': 51.13, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 880.54, 'epoch': 2.22}

 74%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 60/81 [24:56<08:39, 24.73s/it]
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 61/81 [25:19<08:04, 24.21s/it]
                                                                                                                                                                         
{'loss': 0.7868, 'grad_norm': 0.6387739181518555, 'learning_rate': 3.289015716409631e-07, 'memory/max_active (GiB)': 41.57, 'memory/max_allocated (GiB)': 41.57, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 740.08, 'epoch': 2.26}

 75%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 61/81 [25:19<08:04, 24.21s/it]
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████                               | 62/81 [25:45<07:49, 24.70s/it]
                                                                                                                                                                         
{'loss': 0.8116, 'grad_norm': 0.691184401512146, 'learning_rate': 2.9995792543057473e-07, 'memory/max_active (GiB)': 71.73, 'memory/max_allocated (GiB)': 71.73, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 889.11, 'epoch': 2.3}

 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████                               | 62/81 [25:45<07:49, 24.70s/it]
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 63/81 [26:09<07:22, 24.58s/it]
                                                                                                                                                                         
{'loss': 0.6392, 'grad_norm': 0.492448627948761, 'learning_rate': 2.721211896305059e-07, 'memory/max_active (GiB)': 66.99, 'memory/max_allocated (GiB)': 66.99, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 779.69, 'epoch': 2.33}

 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 63/81 [26:09<07:22, 24.58s/it]
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 64/81 [26:33<06:56, 24.47s/it]
                                                                                                                                                                         
{'loss': 0.7128, 'grad_norm': 0.5834090709686279, 'learning_rate': 2.454353798417698e-07, 'memory/max_active (GiB)': 79.01, 'memory/max_allocated (GiB)': 79.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 821.37, 'epoch': 2.37}

 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 64/81 [26:33<06:56, 24.47s/it]
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 65/81 [26:56<06:24, 24.04s/it]
                                                                                                                                                                         
{'loss': 0.8861, 'grad_norm': 0.6599698662757874, 'learning_rate': 2.1994269181478798e-07, 'memory/max_active (GiB)': 47.26, 'memory/max_allocated (GiB)': 47.26, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 811.24, 'epoch': 2.41}

 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 65/81 [26:56<06:24, 24.04s/it]
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 66/81 [27:23<06:10, 24.71s/it]
                                                                                                                                                                         
{'loss': 2.1615, 'grad_norm': 0.7803909778594971, 'learning_rate': 1.956834347292352e-07, 'memory/max_active (GiB)': 89.03, 'memory/max_allocated (GiB)': 89.03, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 882.58, 'epoch': 2.45}

 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 66/81 [27:23<06:10, 24.71s/it]
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 67/81 [27:49<05:51, 25.11s/it]
                                                                                                                                                                         
{'loss': 0.9951, 'grad_norm': 0.558167576789856, 'learning_rate': 1.7269596745694292e-07, 'memory/max_active (GiB)': 89.47, 'memory/max_allocated (GiB)': 89.47, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 948.51, 'epoch': 2.48}

 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 67/81 [27:49<05:51, 25.11s/it]
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 68/81 [28:16<05:34, 25.73s/it]
                                                                                                                                                                         
{'loss': 2.4148, 'grad_norm': 1.3278695344924927, 'learning_rate': 1.5101663790863595e-07, 'memory/max_active (GiB)': 89.46, 'memory/max_allocated (GiB)': 89.46, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.43, 'epoch': 2.52}

 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 68/81 [28:16<05:34, 25.73s/it]
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 69/81 [28:40<05:01, 25.12s/it]
                                                                                                                                                                         
{'loss': 1.4344, 'grad_norm': 1.1428226232528687, 'learning_rate': 1.306797255604175e-07, 'memory/max_active (GiB)': 51.65, 'memory/max_allocated (GiB)': 51.65, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 852.22, 'epoch': 2.56}

 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 69/81 [28:40<05:01, 25.12s/it]
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 70/81 [29:03<04:30, 24.60s/it]
                                                                                                                                                                         
{'loss': 1.6188, 'grad_norm': 1.5298364162445068, 'learning_rate': 1.1171738725086832e-07, 'memory/max_active (GiB)': 68.56, 'memory/max_allocated (GiB)': 68.56, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 861.71, 'epoch': 2.59}

 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 70/81 [29:03<04:30, 24.60s/it]
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 71/81 [29:29<04:09, 24.94s/it]
                                                                                                                                                                         
{'loss': 0.6562, 'grad_norm': 0.5825881361961365, 'learning_rate': 9.415960633447673e-08, 'memory/max_active (GiB)': 86.4, 'memory/max_allocated (GiB)': 86.4, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 991.35, 'epoch': 2.63}

 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 71/81 [29:29<04:09, 24.94s/it]
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 72/81 [29:52<03:40, 24.49s/it]
                                                                                                                                                                         
{'loss': 3.4467, 'grad_norm': 1.508130669593811, 'learning_rate': 7.803414527179342e-08, 'memory/max_active (GiB)': 56.32, 'memory/max_allocated (GiB)': 56.32, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 757.41, 'epoch': 2.67}

 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 72/81 [29:52<03:40, 24.49s/it]
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 73/81 [30:17<03:15, 24.48s/it]
                                                                                                                                                                         
{'loss': 3.2649, 'grad_norm': 1.0750031471252441, 'learning_rate': 6.336650173127223e-08, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 793.48, 'epoch': 2.71}

 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 73/81 [30:17<03:15, 24.48s/it]
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 74/81 [30:40<02:48, 24.07s/it]
                                                                                                                                                                         
{'loss': 2.3341, 'grad_norm': 2.035534381866455, 'learning_rate': 5.017986827221732e-08, 'memory/max_active (GiB)': 58.54, 'memory/max_allocated (GiB)': 58.54, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 766.33, 'epoch': 2.74}

 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 74/81 [30:40<02:48, 24.07s/it]
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 75/81 [31:04<02:24, 24.10s/it]
                                                                                                                                                                         
{'loss': 1.6347, 'grad_norm': 1.792739748954773, 'learning_rate': 3.849509567257958e-08, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 786.7, 'epoch': 2.78}

 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 75/81 [31:04<02:24, 24.10s/it]
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 76/81 [31:32<02:06, 25.35s/it]
                                                                                                                                                                         
{'loss': 1.4495, 'grad_norm': 1.2769355773925781, 'learning_rate': 2.8330659959589942e-08, 'memory/max_active (GiB)': 86.1, 'memory/max_allocated (GiB)': 86.1, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1065.84, 'epoch': 2.82}

 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 76/81 [31:32<02:06, 25.35s/it]
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 77/81 [31:56<01:39, 24.82s/it]
                                                                                                                                                                         
{'loss': 2.2957, 'grad_norm': 1.664533257484436, 'learning_rate': 1.9702633195363917e-08, 'memory/max_active (GiB)': 60.57, 'memory/max_allocated (GiB)': 60.57, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 865.29, 'epoch': 2.85}

 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 77/81 [31:56<01:39, 24.82s/it]
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 78/81 [32:24<01:17, 25.73s/it]
                                                                                                                                                                         
{'loss': 1.8847, 'grad_norm': 1.805460810661316, 'learning_rate': 1.2624658063666638e-08, 'memory/max_active (GiB)': 84.93, 'memory/max_allocated (GiB)': 84.93, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 789.11, 'epoch': 2.89}

 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 78/81 [32:24<01:17, 25.73s/it]
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 79/81 [32:49<00:51, 25.71s/it]
                                                                                                                                                                         
{'loss': 0.6906, 'grad_norm': 0.49316051602363586, 'learning_rate': 7.10792629802659e-09, 'memory/max_active (GiB)': 79.04, 'memory/max_allocated (GiB)': 79.04, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1009.91, 'epoch': 2.93}

 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 79/81 [32:49<00:51, 25.71s/it]
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 80/81 [33:16<00:26, 26.12s/it]
                                                                                                                                                                         
{'loss': 1.9236, 'grad_norm': 1.247730016708374, 'learning_rate': 3.1611609853041676e-09, 'memory/max_active (GiB)': 81.45, 'memory/max_allocated (GiB)': 81.45, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.97, 'epoch': 2.97}

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 80/81 [33:16<00:26, 26.12s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [33:40<00:00, 25.25s/it]
                                                                                                                                                                         
{'loss': 0.9602, 'grad_norm': 0.6849121451377869, 'learning_rate': 7.906027726981567e-10, 'memory/max_active (GiB)': 76.78, 'memory/max_allocated (GiB)': 76.78, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 870.01, 'epoch': 3.0}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [33:40<00:00, 25.25s/it][2025-12-27 21:52:44,059] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-81

                                                                                                                                                                         
{'train_runtime': 2021.9142, 'train_samples_per_second': 5.128, 'train_steps_per_second': 0.04, 'train_loss': 1.5273678048893258, 'memory/max_active (GiB)': 4.6, 'memory/max_allocated (GiB)': 4.6, 'memory/device_reserved (GiB)': 90.68, 'epoch': 3.0}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [33:41<00:00, 25.25s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [33:41<00:00, 24.96s/it]
[2025-12-27 21:52:46,295] [INFO] [axolotl.train.save_trained_model:218] [PID:122677] Training completed! Saving trained model to ./outputs/mymodel.
[2025-12-27 21:52:47,133] [INFO] [axolotl.train.save_trained_model:336] [PID:122677] Model successfully saved to ./outputs/mymodel