| [2025-12-27 21:18:07,941] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:122677] bf16 support detected, enabling for this configuration. | |
| config.json: 0.00B [00:00, ?B/s] config.json: 1.54kB [00:00, 6.02MB/s] | |
| [2025-12-27 21:18:08,103] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:122677] baseline 0.000GB () | |
| [2025-12-27 21:18:08,106] [INFO] [axolotl.cli.config.load_cfg:248] [PID:122677] config: | |
| { | |
| "activation_offloading": false, | |
| "adapter": "lora", | |
| "axolotl_config_path": "config.yaml", | |
| "base_model": "BKM1804/affine-he-CIVICbeatPORSCHE", | |
| "base_model_config": "BKM1804/affine-he-CIVICbeatPORSCHE", | |
| "batch_size": 128, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_90", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "context_parallel_size": 1, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_processes": 18, | |
| "datasets": [ | |
| { | |
| "chat_template": "tokenizer_default", | |
| "field_messages": "messages", | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "/workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl", | |
| "split": "train", | |
| "trust_remote_code": false, | |
| "type": "chat_template" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.7.1" | |
| }, | |
| "eval_batch_size": 2, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 64, | |
| "gradient_checkpointing": false, | |
| "include_tkps": true, | |
| "learning_rate": 2e-06, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": true, | |
| "local_rank": 0, | |
| "lora_alpha": 64, | |
| "lora_dropout": 0.05, | |
| "lora_r": 32, | |
| "lora_target_modules": [ | |
| "q_proj", | |
| "v_proj", | |
| "k_proj", | |
| "o_proj", | |
| "gate_proj", | |
| "down_proj", | |
| "up_proj" | |
| ], | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "cosine", | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 2, | |
| "model_config_type": "qwen3", | |
| "num_epochs": 3.0, | |
| "optimizer": "adamw_bnb_8bit", | |
| "output_dir": "./outputs/mymodel", | |
| "pretrain_multipack_attn": true, | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": false, | |
| "save_safetensors": true, | |
| "sequence_len": 4096, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "BKM1804/affine-he-CIVICbeatPORSCHE", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "use_ray": false, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "weight_decay": 0.0, | |
| "world_size": 1 | |
| } | |
| tokenizer_config.json: 0.00B [00:00, ?B/s] tokenizer_config.json: 5.40kB [00:00, 22.8MB/s] | |
| vocab.json: 0.00B [00:00, ?B/s] vocab.json: 32.8kB [00:00, 297kB/s] vocab.json: 1.66MB [00:00, 7.30MB/s] vocab.json: 2.78MB [00:00, 10.2MB/s] | |
| merges.txt: 0.00B [00:00, ?B/s] merges.txt: 43.4kB [00:00, 357kB/s] merges.txt: 1.67MB [00:00, 8.66MB/s] | |
| tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s] tokenizer.json: 3%|ââââ | 329k/11.4M [00:00<00:16, 682kB/s] tokenizer.json: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 11.4M/11.4M [00:00<00:00, 25.0MB/s] tokenizer.json: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 11.4M/11.4M [00:00<00:00, 19.1MB/s] | |
| added_tokens.json: 0%| | 0.00/707 [00:00<?, ?B/s] added_tokens.json: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 707/707 [00:00<00:00, 5.99MB/s] | |
| special_tokens_map.json: 0%| | 0.00/613 [00:00<?, ?B/s] special_tokens_map.json: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 613/613 [00:00<00:00, 4.94MB/s] | |
| chat_template.jinja: 0.00B [00:00, ?B/s] chat_template.jinja: 4.93kB [00:00, 23.1MB/s] | |
| [2025-12-27 21:18:10,826] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <|im_end|> | |
| [2025-12-27 21:18:10,827] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None | |
| [2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <|endoftext|> | |
| [2025-12-27 21:18:10,828] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None | |
| [2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:122677] Unable to find prepared dataset in last_run_prepared/f6b60198703671e2d2150636511428c1 | |
| [2025-12-27 21:18:10,829] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:122677] Loading raw datasets... | |
| [2025-12-27 21:18:10,829] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:122677] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. | |
| [2025-12-27 21:18:10,933] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:122677] Loading dataset: /workspace/fine-tuning/dataset/train_qwen3_lgc.jsonl with base_type: chat_template and prompt_style: None | |
| [2025-12-27 21:18:10,935] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:122677] Using chat template: | |
| --- | |
| {%- set ns = namespace(last_query_index=-1) %} | |
| {%- for message in messages %} | |
| {%- if message.role == "user" %} | |
| {%- set ns.last_query_index = loop.index0 %} | |
| {%- endif %} | |
| {%- endfor %} | |
| {%- if tools %} | |
| {{- '<|im_start|>system\n' }} | |
| {%- if messages[0].role == 'system' %} | |
| {{- messages[0].content + '\n\n' }} | |
| {%- endif %} | |
| {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} | |
| {%- for tool in tools %} | |
| {{- "\n" }} | |
| {{- tool | tojson }} | |
| {%- endfor %} | |
| {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} | |
| {%- else %} | |
| {%- if messages[0].role == 'system' %} | |
| {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} | |
| {%- endif %} | |
| {%- endif %} | |
| {%- for message in messages %} | |
| {%- if message.content is string %} | |
| {%- set content = message.content %} | |
| {%- else %} | |
| {%- set content = '' %} | |
| {%- endif %} | |
| {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} | |
| {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} | |
| {%- elif message.role == "assistant" %} | |
| {%- set has_loss = (message.loss is defined and message.loss) %} | |
| {%- set reasoning_content = '' %} | |
| {%- if message.reasoning_content is string %} | |
| {%- set reasoning_content = message.reasoning_content %} | |
| {%- else %} | |
| {%- if '</think>' in content %} | |
| {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} | |
| {%- set content = content.split('</think>')[-1].lstrip('\n') %} | |
| {%- endif %} | |
| {%- endif %} | |
| {{- '<|im_start|>' + message.role + '\n' }} | |
| {%- if has_loss -%} | |
| {%- generation -%} | |
| {%- if loop.index0 > ns.last_query_index and reasoning_content %} | |
| {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} | |
| {%- else %} | |
| {{- content }} | |
| {%- endif %} | |
| {%- if message.tool_calls %} | |
| {%- for tool_call in message.tool_calls %} | |
| {%- if (loop.first and content) or (not loop.first) %} | |
| {{- '\n' }} | |
| {%- endif %} | |
| {%- if tool_call.function %} | |
| {%- set tool_call = tool_call.function %} | |
| {%- endif %} | |
| {{- '<tool_call>\n{"name": "' }} | |
| {{- tool_call.name }} | |
| {{- '", "arguments": ' }} | |
| {%- if tool_call.arguments is string %} | |
| {{- tool_call.arguments }} | |
| {%- else %} | |
| {{- tool_call.arguments | tojson }} | |
| {%- endif %} | |
| {{- '}\n</tool_call>' }} | |
| {%- endfor %} | |
| {%- endif %} | |
| {{- '<|im_end|>' }} | |
| {%- endgeneration -%} | |
| {{- '\n' }} | |
| {%- else -%} | |
| {%- if loop.index0 > ns.last_query_index and reasoning_content %} | |
| {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} | |
| {%- else %} | |
| {{- content }} | |
| {%- endif %} | |
| {%- if message.tool_calls %} | |
| {%- for tool_call in message.tool_calls %} | |
| {%- if (loop.first and content) or (not loop.first) %} | |
| {{- '\n' }} | |
| {%- endif %} | |
| {%- if tool_call.function %} | |
| {%- set tool_call = tool_call.function %} | |
| {%- endif %} | |
| {{- '<tool_call>\n{"name": "' }} | |
| {{- tool_call.name }} | |
| {{- '", "arguments": ' }} | |
| {%- if tool_call.arguments is string %} | |
| {{- tool_call.arguments }} | |
| {%- else %} | |
| {{- tool_call.arguments | tojson }} | |
| {%- endif %} | |
| {{- '}\n</tool_call>' }} | |
| {%- endfor %} | |
| {%- endif %} | |
| {{- '<|im_end|>\n' }} | |
| {%- endif %} | |
| {%- elif message.role == "tool" %} | |
| {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} | |
| {{- '<|im_start|>user' }} | |
| {%- endif %} | |
| {{- '\n<tool_response>\n' }} | |
| {{- content }} | |
| {{- '\n</tool_response>' }} | |
| {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} | |
| {{- '<|im_end|>\n' }} | |
| {%- endif %} | |
| {%- endif %} | |
| {%- endfor %} | |
| {%- if add_generation_prompt %} | |
| {{- '<|im_start|>assistant\n' }} | |
| {%- endif %} | |
| --- | |
| Tokenizing Prompts (num_proc=18): 0%| | 0/3494 [00:00<?, ? examples/s] Tokenizing Prompts (num_proc=18): 6%|âââââ | 195/3494 [00:01<00:18, 179.32 examples/s] Tokenizing Prompts (num_proc=18): 11%|ââââââââââ | 390/3494 [00:01<00:08, 380.73 examples/s] Tokenizing Prompts (num_proc=18): 17%|âââââââââââââââ | 584/3494 [00:01<00:04, 583.45 examples/s] Tokenizing Prompts (num_proc=18): 28%|ââââââââââââââââââââââââ | 972/3494 [00:01<00:02, 1112.10 examples/s] Tokenizing Prompts (num_proc=18): 39%|âââââââââââââââââââââââââââââââââ | 1360/3494 [00:01<00:01, 1564.38 examples/s] Tokenizing Prompts (num_proc=18): 50%|âââââââââââââââââââââââââââââââââââââââââââ | 1748/3494 [00:01<00:00, 2022.08 examples/s] Tokenizing Prompts (num_proc=18): 61%|ââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2136/3494 [00:01<00:00, 1942.13 examples/s] Tokenizing Prompts (num_proc=18): 89%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3106/3494 [00:02<00:00, 3193.48 examples/s] Tokenizing Prompts (num_proc=18): 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3494/3494 [00:02<00:00, 2646.33 examples/s] Tokenizing Prompts (num_proc=18): 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3494/3494 [00:02<00:00, 1446.93 examples/s] | |
| [2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:122677] min_input_len: 64 | |
| [2025-12-27 21:18:13,494] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:122677] max_input_len: 24840 | |
| Dropping Long Sequences (>4096) (num_proc=18): 0%| | 0/3494 [00:00<?, ? examples/s] Dropping Long Sequences (>4096) (num_proc=18): 6%|âââââ | 195/3494 [00:00<00:05, 564.88 examples/s] Dropping Long Sequences (>4096) (num_proc=18): 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3494/3494 [00:00<00:00, 6342.01 examples/s] | |
| [2025-12-27 21:18:14,100] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:122677] Dropped 49 samples from dataset | |
| Saving the dataset (0/13 shards): 0%| | 0/3445 [00:00<?, ? examples/s] Saving the dataset (0/13 shards): 8%|âââââââ | 265/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (1/13 shards): 8%|âââââââ | 265/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (2/13 shards): 15%|ââââââââââââââ | 530/3445 [00:00<00:03, 901.51 examples/s] Saving the dataset (3/13 shards): 23%|ââââââââââââââââââââ | 795/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (4/13 shards): 31%|âââââââââââââââââââââââââââ | 1060/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (5/13 shards): 38%|âââââââââââââââââââââââââââââââââ | 1325/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (6/13 shards): 46%|ââââââââââââââââââââââââââââââââââââââââ | 1590/3445 [00:00<00:02, 901.51 examples/s] Saving the dataset (7/13 shards): 54%|âââââââââââââââââââââââââââââââââââââââââââââââ | 1855/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (8/13 shards): 62%|âââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2120/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (9/13 shards): 69%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2385/3445 [00:00<00:01, 901.51 examples/s] Saving the dataset (10/13 shards): 77%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2650/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (11/13 shards): 85%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2915/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (12/13 shards): 92%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3180/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (13/13 shards): 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3445/3445 [00:00<00:00, 901.51 examples/s] Saving the dataset (13/13 shards): 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3445/3445 [00:00<00:00, 8496.02 examples/s] | |
| [2025-12-27 21:18:14,663] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:122677] total_num_tokens: 1_863_059 | |
| [2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:122677] `total_supervised_tokens: 888_884` | |
| [2025-12-27 21:18:14,695] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:122677] total_num_steps: 81 | |
| [2025-12-27 21:18:14,696] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:122677] Maximum number of steps set at 81 | |
| [2025-12-27 21:18:14,722] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:122677] Loading tokenizer... BKM1804/affine-he-CIVICbeatPORSCHE | |
| [2025-12-27 21:18:15,206] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:122677] EOS: 151645 / <|im_end|> | |
| [2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:122677] BOS: None / None | |
| [2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:122677] PAD: 151643 / <|endoftext|> | |
| [2025-12-27 21:18:15,207] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:122677] UNK: None / None | |
| [2025-12-27 21:18:15,208] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:122677] Loading model | |
| [2025-12-27 21:18:15,257] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:122677] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2025-12-27 21:18:15,258] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:122677] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| model.safetensors.index.json: 0.00B [00:00, ?B/s] model.safetensors.index.json: 32.9kB [00:00, 88.3MB/s] | |
| model-00001-of-00002.safetensors: 0%| | 0.00/4.97G [00:00<?, ?B/s] model-00001-of-00002.safetensors: 0%| | 680k/4.97G [00:00<2:00:53, 685kB/s] model-00001-of-00002.safetensors: 0%| | 2.84M/4.97G [00:01<39:57, 2.07MB/s] model-00001-of-00002.safetensors: 1%|ââ | 70.5M/4.97G [00:01<01:22, 59.1MB/s] model-00001-of-00002.safetensors: 3%|âââ | 138M/4.97G [00:02<00:49, 98.0MB/s] model-00001-of-00002.safetensors: 4%|ââââ | 205M/4.97G [00:02<00:33, 141MB/s] model-00001-of-00002.safetensors: 5%|ââââââ | 272M/4.97G [00:02<00:25, 185MB/s] model-00001-of-00002.safetensors: 7%|âââââââ | 339M/4.97G [00:02<00:20, 222MB/s] model-00001-of-00002.safetensors: 8%|ââââââââ | 406M/4.97G [00:03<00:19, 235MB/s] model-00001-of-00002.safetensors: 10%|âââââââââ | 473M/4.97G [00:03<00:20, 225MB/s] model-00001-of-00002.safetensors: 11%|âââââââââââ | 540M/4.97G [00:03<00:20, 215MB/s] model-00001-of-00002.safetensors: 12%|ââââââââââââ | 607M/4.97G [00:04<00:20, 210MB/s] model-00001-of-00002.safetensors: 14%|âââââââââââââ | 674M/4.97G [00:04<00:20, 209MB/s] model-00001-of-00002.safetensors: 15%|âââââââââââââââ | 741M/4.97G [00:04<00:17, 237MB/s] model-00001-of-00002.safetensors: 16%|ââââââââââââââââ | 808M/4.97G [00:04<00:15, 262MB/s] model-00001-of-00002.safetensors: 18%|âââââââââââââââââ | 875M/4.97G [00:04<00:14, 286MB/s] model-00001-of-00002.safetensors: 19%|ââââââââââââââââââ | 942M/4.97G [00:05<00:13, 294MB/s] model-00001-of-00002.safetensors: 20%|âââââââââââââââââââ | 1.01G/4.97G [00:05<00:15, 260MB/s] model-00001-of-00002.safetensors: 22%|âââââââââââââââââââââ | 1.08G/4.97G [00:05<00:16, 237MB/s] model-00001-of-00002.safetensors: 23%|ââââââââââââââââââââââ | 1.14G/4.97G [00:06<00:15, 251MB/s] model-00001-of-00002.safetensors: 24%|âââââââââââââââââââââââ | 1.21G/4.97G [00:06<00:14, 262MB/s] model-00001-of-00002.safetensors: 26%|âââââââââââââââââââââââââ | 1.28G/4.97G [00:06<00:12, 289MB/s] model-00001-of-00002.safetensors: 27%|ââââââââââââââââââââââââââ | 1.34G/4.97G [00:06<00:12, 283MB/s] model-00001-of-00002.safetensors: 28%|âââââââââââââââââââââââââââ | 1.41G/4.97G [00:06<00:11, 298MB/s] model-00001-of-00002.safetensors: 30%|ââââââââââââââââââââââââââââ | 1.48G/4.97G [00:07<00:10, 334MB/s] model-00001-of-00002.safetensors: 31%|ââââââââââââââââââââââââââââââ | 1.55G/4.97G [00:07<00:10, 318MB/s] model-00001-of-00002.safetensors: 32%|âââââââââââââââââââââââââââââââ | 1.61G/4.97G [00:07<00:12, 265MB/s] model-00001-of-00002.safetensors: 34%|ââââââââââââââââââââââââââââââââ | 1.68G/4.97G [00:07<00:11, 287MB/s] model-00001-of-00002.safetensors: 35%|âââââââââââââââââââââââââââââââââ | 1.75G/4.97G [00:08<00:11, 291MB/s] model-00001-of-00002.safetensors: 36%|âââââââââââââââââââââââââââââââââââ | 1.81G/4.97G [00:08<00:10, 290MB/s] model-00001-of-00002.safetensors: 38%|ââââââââââââââââââââââââââââââââââââ | 1.88G/4.97G [00:08<00:10, 301MB/s] model-00001-of-00002.safetensors: 39%|âââââââââââââââââââââââââââââââââââââ | 1.95G/4.97G [00:08<00:09, 305MB/s] model-00001-of-00002.safetensors: 41%|ââââââââââââââââââââââââââââââââââââââ | 2.01G/4.97G [00:08<00:08, 333MB/s] model-00001-of-00002.safetensors: 42%|ââââââââââââââââââââââââââââââââââââââââ | 2.08G/4.97G [00:09<00:08, 346MB/s] model-00001-of-00002.safetensors: 43%|âââââââââââââââââââââââââââââââââââââââââ | 2.15G/4.97G [00:09<00:07, 353MB/s] model-00001-of-00002.safetensors: 45%|ââââââââââââââââââââââââââââââââââââââââââ | 2.22G/4.97G [00:09<00:07, 356MB/s] model-00001-of-00002.safetensors: 46%|ââââââââââââââââââââââââââââââââââââââââââââ | 2.28G/4.97G [00:09<00:07, 343MB/s] model-00001-of-00002.safetensors: 47%|âââââââââââââââââââââââââââââââââââââââââââââ | 2.35G/4.97G [00:09<00:07, 340MB/s] model-00001-of-00002.safetensors: 49%|ââââââââââââââââââââââââââââââââââââââââââââââ | 2.42G/4.97G [00:10<00:07, 320MB/s] model-00001-of-00002.safetensors: 50%|âââââââââââââââââââââââââââââââââââââââââââââââ | 2.48G/4.97G [00:10<00:07, 336MB/s] model-00001-of-00002.safetensors: 51%|âââââââââââââââââââââââââââââââââââââââââââââââââ | 2.55G/4.97G [00:10<00:07, 335MB/s] model-00001-of-00002.safetensors: 53%|ââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.62G/4.97G [00:10<00:07, 295MB/s] model-00001-of-00002.safetensors: 54%|âââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.68G/4.97G [00:10<00:07, 310MB/s] model-00001-of-00002.safetensors: 55%|ââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.75G/4.97G [00:11<00:07, 309MB/s] model-00001-of-00002.safetensors: 57%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.82G/4.97G [00:11<00:06, 308MB/s] model-00001-of-00002.safetensors: 58%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.89G/4.97G [00:11<00:06, 330MB/s] model-00001-of-00002.safetensors: 59%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.95G/4.97G [00:11<00:05, 344MB/s] model-00001-of-00002.safetensors: 61%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.02G/4.97G [00:11<00:05, 355MB/s] model-00001-of-00002.safetensors: 62%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.09G/4.97G [00:12<00:05, 360MB/s] model-00001-of-00002.safetensors: 64%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.15G/4.97G [00:12<00:06, 282MB/s] model-00001-of-00002.safetensors: 65%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.22G/4.97G [00:12<00:05, 312MB/s] model-00001-of-00002.safetensors: 66%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.29G/4.97G [00:12<00:05, 334MB/s] model-00001-of-00002.safetensors: 68%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.36G/4.97G [00:13<00:06, 253MB/s] model-00001-of-00002.safetensors: 69%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.43G/4.97G [00:13<00:05, 259MB/s] model-00001-of-00002.safetensors: 70%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.49G/4.97G [00:13<00:05, 264MB/s] model-00001-of-00002.safetensors: 72%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.56G/4.97G [00:13<00:05, 274MB/s] model-00001-of-00002.safetensors: 73%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.63G/4.97G [00:14<00:05, 260MB/s] model-00001-of-00002.safetensors: 74%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.70G/4.97G [00:14<00:04, 256MB/s] model-00001-of-00002.safetensors: 76%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.77G/4.97G [00:14<00:04, 274MB/s] model-00001-of-00002.safetensors: 77%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.83G/4.97G [00:14<00:03, 296MB/s] model-00001-of-00002.safetensors: 78%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.90G/4.97G [00:15<00:03, 317MB/s] model-00001-of-00002.safetensors: 80%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.96G/4.97G [00:15<00:03, 319MB/s] model-00001-of-00002.safetensors: 81%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.03G/4.97G [00:15<00:02, 324MB/s] model-00001-of-00002.safetensors: 83%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.10G/4.97G [00:15<00:02, 335MB/s] model-00001-of-00002.safetensors: 84%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.17G/4.97G [00:15<00:02, 321MB/s] model-00001-of-00002.safetensors: 85%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.23G/4.97G [00:16<00:02, 313MB/s] model-00001-of-00002.safetensors: 87%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.30G/4.97G [00:16<00:02, 311MB/s] model-00001-of-00002.safetensors: 88%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.37G/4.97G [00:16<00:02, 299MB/s] model-00001-of-00002.safetensors: 89%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.43G/4.97G [00:16<00:01, 322MB/s] model-00001-of-00002.safetensors: 91%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.50G/4.97G [00:16<00:01, 294MB/s] model-00001-of-00002.safetensors: 92%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.57G/4.97G [00:17<00:01, 313MB/s] model-00001-of-00002.safetensors: 93%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.63G/4.97G [00:17<00:01, 311MB/s] model-00001-of-00002.safetensors: 95%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.70G/4.97G [00:17<00:00, 320MB/s] model-00001-of-00002.safetensors: 96%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.77G/4.97G [00:17<00:00, 328MB/s] model-00001-of-00002.safetensors: 97%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.83G/4.97G [00:17<00:00, 326MB/s] model-00001-of-00002.safetensors: 99%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 4.90G/4.97G [00:18<00:00, 346MB/s] model-00001-of-00002.safetensors: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 4.97G/4.97G [00:18<00:00, 290MB/s] model-00001-of-00002.safetensors: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 4.97G/4.97G [00:18<00:00, 269MB/s] | |
| model-00002-of-00002.safetensors: 0%| | 0.00/3.08G [00:00<?, ?B/s] model-00002-of-00002.safetensors: 0%| | 547k/3.08G [00:00<1:11:35, 716kB/s] model-00002-of-00002.safetensors: 0%| | 1.87M/3.08G [00:01<25:07, 2.04MB/s] model-00002-of-00002.safetensors: 2%|ââ | 68.4M/3.08G [00:01<00:48, 62.3MB/s] model-00002-of-00002.safetensors: 4%|âââââ | 135M/3.08G [00:01<00:28, 102MB/s] model-00002-of-00002.safetensors: 7%|âââââââ | 203M/3.08G [00:02<00:20, 142MB/s] model-00002-of-00002.safetensors: 9%|âââââââââ | 270M/3.08G [00:02<00:17, 162MB/s] model-00002-of-00002.safetensors: 11%|âââââââââââ | 337M/3.08G [00:02<00:16, 165MB/s] model-00002-of-00002.safetensors: 13%|âââââââââââââ | 404M/3.08G [00:03<00:13, 200MB/s] model-00002-of-00002.safetensors: 15%|âââââââââââââââ | 471M/3.08G [00:03<00:15, 167MB/s] model-00002-of-00002.safetensors: 17%|âââââââââââââââââ | 538M/3.08G [00:04<00:15, 168MB/s] model-00002-of-00002.safetensors: 20%|âââââââââââââââââââ | 605M/3.08G [00:04<00:16, 154MB/s] model-00002-of-00002.safetensors: 22%|âââââââââââââââââââââ | 672M/3.08G [00:04<00:13, 182MB/s] model-00002-of-00002.safetensors: 24%|âââââââââââââââââââââââ | 739M/3.08G [00:04<00:11, 210MB/s] model-00002-of-00002.safetensors: 26%|âââââââââââââââââââââââââ | 806M/3.08G [00:05<00:11, 198MB/s] model-00002-of-00002.safetensors: 28%|âââââââââââââââââââââââââââ | 873M/3.08G [00:05<00:10, 216MB/s] model-00002-of-00002.safetensors: 31%|âââââââââââââââââââââââââââââ | 940M/3.08G [00:05<00:08, 245MB/s] model-00002-of-00002.safetensors: 33%|âââââââââââââââââââââââââââââââ | 1.01G/3.08G [00:05<00:07, 272MB/s] model-00002-of-00002.safetensors: 35%|âââââââââââââââââââââââââââââââââ | 1.07G/3.08G [00:06<00:07, 280MB/s] model-00002-of-00002.safetensors: 37%|âââââââââââââââââââââââââââââââââââ | 1.14G/3.08G [00:06<00:06, 295MB/s] model-00002-of-00002.safetensors: 39%|âââââââââââââââââââââââââââââââââââââ | 1.21G/3.08G [00:06<00:05, 315MB/s] model-00002-of-00002.safetensors: 41%|âââââââââââââââââââââââââââââââââââââââ | 1.28G/3.08G [00:06<00:06, 292MB/s] model-00002-of-00002.safetensors: 44%|âââââââââââââââââââââââââââââââââââââââââ | 1.34G/3.08G [00:07<00:05, 302MB/s] model-00002-of-00002.safetensors: 46%|âââââââââââââââââââââââââââââââââââââââââââ | 1.41G/3.08G [00:07<00:05, 303MB/s] model-00002-of-00002.safetensors: 48%|âââââââââââââââââââââââââââââââââââââââââââââ | 1.48G/3.08G [00:07<00:05, 319MB/s] model-00002-of-00002.safetensors: 50%|âââââââââââââââââââââââââââââââââââââââââââââââ | 1.54G/3.08G [00:07<00:04, 318MB/s] model-00002-of-00002.safetensors: 52%|ââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.61G/3.08G [00:07<00:04, 339MB/s] model-00002-of-00002.safetensors: 54%|ââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.68G/3.08G [00:08<00:04, 337MB/s] model-00002-of-00002.safetensors: 57%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.74G/3.08G [00:08<00:04, 315MB/s] model-00002-of-00002.safetensors: 59%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.81G/3.08G [00:08<00:04, 291MB/s] model-00002-of-00002.safetensors: 61%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.88G/3.08G [00:08<00:03, 314MB/s] model-00002-of-00002.safetensors: 63%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1.95G/3.08G [00:08<00:03, 320MB/s] model-00002-of-00002.safetensors: 65%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.00G/3.08G [00:09<00:03, 311MB/s] model-00002-of-00002.safetensors: 67%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.07G/3.08G [00:09<00:03, 299MB/s] model-00002-of-00002.safetensors: 70%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.14G/3.08G [00:09<00:02, 320MB/s] model-00002-of-00002.safetensors: 72%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.21G/3.08G [00:09<00:02, 343MB/s] model-00002-of-00002.safetensors: 74%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.27G/3.08G [00:09<00:02, 355MB/s] model-00002-of-00002.safetensors: 76%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.34G/3.08G [00:10<00:02, 364MB/s] model-00002-of-00002.safetensors: 78%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.41G/3.08G [00:10<00:01, 367MB/s] model-00002-of-00002.safetensors: 80%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.47G/3.08G [00:10<00:01, 383MB/s] model-00002-of-00002.safetensors: 83%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.54G/3.08G [00:10<00:01, 380MB/s] model-00002-of-00002.safetensors: 85%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.61G/3.08G [00:10<00:01, 304MB/s] model-00002-of-00002.safetensors: 87%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.68G/3.08G [00:11<00:01, 281MB/s] model-00002-of-00002.safetensors: 89%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.74G/3.08G [00:11<00:01, 306MB/s] model-00002-of-00002.safetensors: 91%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.81G/3.08G [00:11<00:01, 259MB/s] model-00002-of-00002.safetensors: 93%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.88G/3.08G [00:11<00:00, 288MB/s] model-00002-of-00002.safetensors: 96%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 2.94G/3.08G [00:12<00:00, 299MB/s] model-00002-of-00002.safetensors: 98%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 3.01G/3.08G [00:12<00:00, 251MB/s] model-00002-of-00002.safetensors: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3.08G/3.08G [00:12<00:00, 281MB/s] model-00002-of-00002.safetensors: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 3.08G/3.08G [00:12<00:00, 244MB/s] | |
| Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] Loading checkpoint shards: 50%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 1/2 [00:06<00:06, 6.04s/it] Loading checkpoint shards: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 2/2 [00:10<00:00, 5.01s/it] Loading checkpoint shards: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 2/2 [00:10<00:00, 5.16s/it] | |
| generation_config.json: 0%| | 0.00/188 [00:00<?, ?B/s] generation_config.json: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 188/188 [00:00<00:00, 2.83MB/s] | |
| [2025-12-27 21:18:58,111] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:122677] converting PEFT model w/ prepare_model_for_kbit_training | |
| [2025-12-27 21:18:58,113] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:122677] Converting modules to torch.bfloat16 | |
| [2025-12-27 21:18:58,117] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] Memory usage after model load 5.665GB (+5.665GB allocated, +5.826GB reserved) | |
| trainable params: 66,060,288 || all params: 4,088,528,384 || trainable%: 1.6157 | |
| [2025-12-27 21:18:58,545] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:122677] after adapters 4.411GB (+4.411GB allocated, +5.947GB reserved) | |
| [2025-12-27 21:19:03,382] [INFO] [axolotl.train.save_initial_configs:398] [PID:122677] Pre-saving adapter config to ./outputs/mymodel... | |
| [2025-12-27 21:19:03,388] [INFO] [axolotl.train.save_initial_configs:402] [PID:122677] Pre-saving tokenizer to ./outputs/mymodel... | |
| [2025-12-27 21:19:03,587] [INFO] [axolotl.train.save_initial_configs:407] [PID:122677] Pre-saving model config to ./outputs/mymodel... | |
| [2025-12-27 21:19:03,594] [INFO] [axolotl.train.execute_training:196] [PID:122677] Starting trainer... | |
| 0%| | 0/81 [00:00<?, ?it/s][2025-12-27 21:19:05,215] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization | |
| warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") | |
| 1%|ââ | 1/81 [00:26<34:57, 26.22s/it] {'loss': 0.7532, 'grad_norm': 0.6496106386184692, 'learning_rate': 0.0, 'memory/max_active (GiB)': 76.91, 'memory/max_allocated (GiB)': 76.91, 'memory/device_reserved (GiB)': 79.86, 'tokens_per_second_per_gpu': 877.99, 'epoch': 0.04} | |
| 1%|ââ | 1/81 [00:26<34:57, 26.22s/it] 2%|ââââ | 2/81 [00:48<31:41, 24.06s/it] {'loss': 0.6528, 'grad_norm': 0.46730196475982666, 'learning_rate': 1e-06, 'memory/max_active (GiB)': 46.79, 'memory/max_allocated (GiB)': 46.79, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 754.12, 'epoch': 0.07} | |
| 2%|ââââ | 2/81 [00:48<31:41, 24.06s/it] 4%|âââââ | 3/81 [01:14<32:03, 24.66s/it] {'loss': 2.2564, 'grad_norm': 0.6027721762657166, 'learning_rate': 2e-06, 'memory/max_active (GiB)': 57.88, 'memory/max_allocated (GiB)': 57.88, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.03, 'epoch': 0.11} | |
| 4%|âââââ | 3/81 [01:14<32:03, 24.66s/it] 5%|âââââââ | 4/81 [01:38<31:33, 24.59s/it] {'loss': 2.3578, 'grad_norm': 1.7880005836486816, 'learning_rate': 1.9992093972273017e-06, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 879.33, 'epoch': 0.15} | |
| 5%|âââââââ | 4/81 [01:38<31:33, 24.59s/it] 6%|âââââââââ | 5/81 [02:02<30:38, 24.18s/it] {'loss': 0.9175, 'grad_norm': 0.5934199690818787, 'learning_rate': 1.9968388390146957e-06, 'memory/max_active (GiB)': 57.81, 'memory/max_allocated (GiB)': 57.81, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 759.41, 'epoch': 0.19} | |
| 6%|âââââââââ | 5/81 [02:02<30:38, 24.18s/it] 7%|ââââââââââ | 6/81 [02:25<29:58, 23.97s/it] {'loss': 1.6229, 'grad_norm': 1.5621187686920166, 'learning_rate': 1.992892073701973e-06, 'memory/max_active (GiB)': 68.44, 'memory/max_allocated (GiB)': 68.44, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 949.82, 'epoch': 0.22} | |
| 7%|ââââââââââ | 6/81 [02:25<29:58, 23.97s/it] 9%|ââââââââââââ | 7/81 [02:51<30:10, 24.47s/it] {'loss': 2.9604, 'grad_norm': 1.206151008605957, 'learning_rate': 1.987375341936333e-06, 'memory/max_active (GiB)': 60.61, 'memory/max_allocated (GiB)': 60.61, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 862.01, 'epoch': 0.26} | |
| 9%|ââââââââââââ | 7/81 [02:51<30:10, 24.47s/it] 10%|ââââââââââââââ | 8/81 [03:15<29:40, 24.39s/it] {'loss': 0.8612, 'grad_norm': 0.6299921870231628, 'learning_rate': 1.9802973668046363e-06, 'memory/max_active (GiB)': 44.43, 'memory/max_allocated (GiB)': 44.43, 'memory/device_reserved (GiB)': 79.98, 'tokens_per_second_per_gpu': 861.47, 'epoch': 0.3} | |
| 10%|ââââââââââââââ | 8/81 [03:15<29:40, 24.39s/it] 11%|âââââââââââââââ | 9/81 [03:41<30:02, 25.03s/it] {'loss': 1.2738, 'grad_norm': 1.1461963653564453, 'learning_rate': 1.9716693400404097e-06, 'memory/max_active (GiB)': 88.86, 'memory/max_allocated (GiB)': 88.86, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.2, 'epoch': 0.33} | |
| 11%|âââââââââââââââ | 9/81 [03:41<30:02, 25.03s/it] 12%|âââââââââââââââââ | 10/81 [04:06<29:20, 24.80s/it] {'loss': 0.9472, 'grad_norm': 0.4421791732311249, 'learning_rate': 1.9615049043274204e-06, 'memory/max_active (GiB)': 84.65, 'memory/max_allocated (GiB)': 84.65, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 819.72, 'epoch': 0.37} | |
| 12%|âââââââââââââââââ | 10/81 [04:06<29:20, 24.80s/it] 14%|ââââââââââââââââââ | 11/81 [04:32<29:37, 25.40s/it] {'loss': 1.3682, 'grad_norm': 1.287254810333252, 'learning_rate': 1.949820131727783e-06, 'memory/max_active (GiB)': 86.21, 'memory/max_allocated (GiB)': 86.21, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1004.54, 'epoch': 0.41} | |
| 14%|ââââââââââââââââââ | 11/81 [04:32<29:37, 25.40s/it] 15%|ââââââââââââââââââââ | 12/81 [04:55<28:20, 24.65s/it] {'loss': 1.3094, 'grad_norm': 0.8032840490341187, 'learning_rate': 1.936633498268728e-06, 'memory/max_active (GiB)': 60.39, 'memory/max_allocated (GiB)': 60.39, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 900.54, 'epoch': 0.45} | |
| 15%|ââââââââââââââââââââ | 12/81 [04:55<28:20, 24.65s/it] 16%|ââââââââââââââââââââââ | 13/81 [05:19<27:30, 24.27s/it] {'loss': 1.7079, 'grad_norm': 0.835660457611084, 'learning_rate': 1.9219658547282065e-06, 'memory/max_active (GiB)': 58.53, 'memory/max_allocated (GiB)': 58.53, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 804.11, 'epoch': 0.48} | |
| 16%|ââââââââââââââââââââââ | 13/81 [05:19<27:30, 24.27s/it] 17%|âââââââââââââââââââââââ | 14/81 [05:43<27:02, 24.21s/it] {'loss': 1.4821, 'grad_norm': 0.7599063515663147, 'learning_rate': 1.9058403936655232e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 771.26, 'epoch': 0.52} | |
| 17%|âââââââââââââââââââââââ | 14/81 [05:43<27:02, 24.21s/it] 19%|âââââââââââââââââââââââââ | 15/81 [06:09<27:17, 24.81s/it] {'loss': 1.1628, 'grad_norm': 0.43679726123809814, 'learning_rate': 1.8882826127491318e-06, 'memory/max_active (GiB)': 79.08, 'memory/max_allocated (GiB)': 79.08, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 897.62, 'epoch': 0.56} | |
| 19%|âââââââââââââââââââââââââ | 15/81 [06:09<27:17, 24.81s/it] 20%|ââââââââââââââââââââââââââ | 16/81 [06:34<26:55, 24.85s/it] {'loss': 2.6527, 'grad_norm': 0.6629171967506409, 'learning_rate': 1.8693202744395827e-06, 'memory/max_active (GiB)': 56.2, 'memory/max_allocated (GiB)': 56.2, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 881.06, 'epoch': 0.59} | |
| 20%|ââââââââââââââââââââââââââ | 16/81 [06:34<26:55, 24.85s/it] 21%|ââââââââââââââââââââââââââââ | 17/81 [06:59<26:31, 24.86s/it] {'loss': 1.5066, 'grad_norm': 0.6648272275924683, 'learning_rate': 1.848983362091364e-06, 'memory/max_active (GiB)': 85.82, 'memory/max_allocated (GiB)': 85.82, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 0.63} | |
| 21%|ââââââââââââââââââââââââââââ | 17/81 [06:59<26:31, 24.86s/it] 22%|ââââââââââââââââââââââââââââââ | 18/81 [07:24<26:10, 24.93s/it] {'loss': 2.1366, 'grad_norm': 1.9141907691955566, 'learning_rate': 1.8273040325430573e-06, 'memory/max_active (GiB)': 58.11, 'memory/max_allocated (GiB)': 58.11, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 996.41, 'epoch': 0.67} | |
| 22%|ââââââââââââââââââââââââââââââ | 18/81 [07:24<26:10, 24.93s/it] 23%|âââââââââââââââââââââââââââââââ | 19/81 [07:49<25:42, 24.88s/it] {'loss': 1.7062, 'grad_norm': 1.0429956912994385, 'learning_rate': 1.8043165652707648e-06, 'memory/max_active (GiB)': 69.3, 'memory/max_allocated (GiB)': 69.3, 'memory/device_reserved (GiB)': 92.14, 'tokens_per_second_per_gpu': 839.76, 'epoch': 0.71} | |
| 23%|âââââââââââââââââââââââââââââââ | 19/81 [07:49<25:42, 24.88s/it] 25%|âââââââââââââââââââââââââââââââââ | 20/81 [08:16<26:07, 25.69s/it] {'loss': 1.7071, 'grad_norm': 0.8590395450592041, 'learning_rate': 1.780057308185212e-06, 'memory/max_active (GiB)': 89.48, 'memory/max_allocated (GiB)': 89.48, 'memory/device_reserved (GiB)': 92.15, 'tokens_per_second_per_gpu': 951.25, 'epoch': 0.74} | |
| 25%|âââââââââââââââââââââââââââââââââ | 20/81 [08:16<26:07, 25.69s/it] 26%|âââââââââââââââââââââââââââââââââââ | 21/81 [08:42<25:35, 25.60s/it] {'loss': 1.4588, 'grad_norm': 0.7291064858436584, 'learning_rate': 1.75456462015823e-06, 'memory/max_active (GiB)': 89.39, 'memory/max_allocated (GiB)': 89.39, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 828.05, 'epoch': 0.78} | |
| 26%|âââââââââââââââââââââââââââââââââââ | 21/81 [08:42<25:35, 25.60s/it] 27%|ââââââââââââââââââââââââââââââââââââ | 22/81 [09:08<25:26, 25.87s/it] {'loss': 1.8362, 'grad_norm': 1.2527186870574951, 'learning_rate': 1.7278788103694942e-06, 'memory/max_active (GiB)': 68.62, 'memory/max_allocated (GiB)': 68.62, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 817.67, 'epoch': 0.82} | |
| 27%|ââââââââââââââââââââââââââââââââââââ | 22/81 [09:08<25:26, 25.87s/it] 28%|ââââââââââââââââââââââââââââââââââââââ | 23/81 [09:34<24:59, 25.86s/it] {'loss': 1.5874, 'grad_norm': 1.3045358657836914, 'learning_rate': 1.7000420745694253e-06, 'memory/max_active (GiB)': 78.8, 'memory/max_allocated (GiB)': 78.8, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 864.14, 'epoch': 0.85} | |
| 28%|ââââââââââââââââââââââââââââââââââââââ | 23/81 [09:34<24:59, 25.86s/it] 30%|âââââââââââââââââââââââââââââââââââââââ | 24/81 [10:00<24:31, 25.82s/it] {'loss': 2.1574, 'grad_norm': 1.6913419961929321, 'learning_rate': 1.6710984283590367e-06, 'memory/max_active (GiB)': 81.59, 'memory/max_allocated (GiB)': 81.59, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 897.53, 'epoch': 0.89} | |
| 30%|âââââââââââââââââââââââââââââââââââââââ | 24/81 [10:00<24:31, 25.82s/it] 31%|âââââââââââââââââââââââââââââââââââââââââ | 25/81 [10:24<23:40, 25.37s/it] {'loss': 1.058, 'grad_norm': 0.7722698450088501, 'learning_rate': 1.64109363759222e-06, 'memory/max_active (GiB)': 61.45, 'memory/max_allocated (GiB)': 61.45, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 942.84, 'epoch': 0.93} | |
| 31%|âââââââââââââââââââââââââââââââââââââââââ | 25/81 [10:24<23:40, 25.37s/it] 32%|âââââââââââââââââââââââââââââââââââââââââââ | 26/81 [10:50<23:33, 25.70s/it] {'loss': 0.9906, 'grad_norm': 0.9898315072059631, 'learning_rate': 1.6100751460105243e-06, 'memory/max_active (GiB)': 86.27, 'memory/max_allocated (GiB)': 86.27, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 910.63, 'epoch': 0.97} | |
| 32%|âââââââââââââââââââââââââââââââââââââââââââ | 26/81 [10:50<23:33, 25.70s/it] 33%|ââââââââââââââââââââââââââââââââââââââââââââ | 27/81 [11:11<21:51, 24.28s/it] {'loss': 0.716, 'grad_norm': 0.5236030220985413, 'learning_rate': 1.5780920002248483e-06, 'memory/max_active (GiB)': 45.08, 'memory/max_allocated (GiB)': 45.08, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 793.15, 'epoch': 1.0} | |
| 33%|ââââââââââââââââââââââââââââââââââââââââââââ | 27/81 [11:11<21:51, 24.28s/it][2025-12-27 21:30:15,957] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-27 | |
| [2025-12-27 21:30:18,782] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization | |
| warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") | |
| 35%|ââââââââââââââââââââââââââââââââââââââââââââââ | 28/81 [11:37<21:44, 24.62s/it] {'loss': 1.4101, 'grad_norm': 0.6720283031463623, 'learning_rate': 1.5451947721626675e-06, 'memory/max_active (GiB)': 49.49, 'memory/max_allocated (GiB)': 49.49, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 713.64, 'epoch': 1.04} | |
| 35%|ââââââââââââââââââââââââââââââââââââââââââââââ | 28/81 [11:37<21:44, 24.62s/it] 36%|ââââââââââââââââââââââââââââââââââââââââââââââââ | 29/81 [12:01<21:16, 24.55s/it] {'loss': 2.9962, 'grad_norm': 2.1970348358154297, 'learning_rate': 1.5114354791034222e-06, 'memory/max_active (GiB)': 68.78, 'memory/max_allocated (GiB)': 68.78, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 887.43, 'epoch': 1.07} | |
| 36%|ââââââââââââââââââââââââââââââââââââââââââââââââ | 29/81 [12:01<21:16, 24.55s/it] 37%|âââââââââââââââââââââââââââââââââââââââââââââââââ | 30/81 [12:24<20:17, 23.88s/it] {'loss': 0.6654, 'grad_norm': 0.5169208645820618, 'learning_rate': 1.476867501428506e-06, 'memory/max_active (GiB)': 69.25, 'memory/max_allocated (GiB)': 69.25, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 837.12, 'epoch': 1.11} | |
| 37%|âââââââââââââââââââââââââââââââââââââââââââââââââ | 30/81 [12:24<20:17, 23.88s/it] 38%|âââââââââââââââââââââââââââââââââââââââââââââââââââ | 31/81 [12:50<20:28, 24.58s/it] {'loss': 2.3111, 'grad_norm': 1.6675411462783813, 'learning_rate': 1.4415454982159118e-06, 'memory/max_active (GiB)': 84.87, 'memory/max_allocated (GiB)': 84.87, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 893.34, 'epoch': 1.15} | |
| 38%|âââââââââââââââââââââââââââââââââââââââââââââââââââ | 31/81 [12:50<20:28, 24.58s/it] 40%|âââââââââââââââââââââââââââââââââââââââââââââââââââââ | 32/81 [13:13<19:47, 24.24s/it] {'loss': 1.454, 'grad_norm': 0.8987345695495605, 'learning_rate': 1.4055253208129937e-06, 'memory/max_active (GiB)': 54.96, 'memory/max_allocated (GiB)': 54.96, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 747.48, 'epoch': 1.19} | |
| 40%|âââââââââââââââââââââââââââââââââââââââââââââââââââââ | 32/81 [13:13<19:47, 24.24s/it] 41%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 33/81 [13:39<19:45, 24.71s/it] {'loss': 2.1194, 'grad_norm': 2.0316708087921143, 'learning_rate': 1.3688639245240078e-06, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 880.1, 'epoch': 1.22} | |
| 41%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 33/81 [13:39<19:45, 24.71s/it] 42%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 34/81 [14:05<19:37, 25.04s/it] {'loss': 1.3693, 'grad_norm': 0.9677668213844299, 'learning_rate': 1.3316192785520678e-06, 'memory/max_active (GiB)': 52.15, 'memory/max_allocated (GiB)': 52.15, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 903.2, 'epoch': 1.26} | |
| 42%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 34/81 [14:05<19:37, 25.04s/it] 43%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 35/81 [14:32<19:46, 25.80s/it] {'loss': 1.5613, 'grad_norm': 1.5035587549209595, 'learning_rate': 1.2938502743379209e-06, 'memory/max_active (GiB)': 89.55, 'memory/max_allocated (GiB)': 89.55, 'memory/device_reserved (GiB)': 90.6, 'tokens_per_second_per_gpu': 1018.31, 'epoch': 1.3} | |
| 43%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 35/81 [14:32<19:46, 25.80s/it] 44%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 36/81 [14:56<18:53, 25.18s/it] {'loss': 1.6472, 'grad_norm': 1.0176411867141724, 'learning_rate': 1.2556166324404746e-06, 'memory/max_active (GiB)': 89.41, 'memory/max_allocated (GiB)': 89.41, 'memory/device_reserved (GiB)': 90.5, 'tokens_per_second_per_gpu': 935.17, 'epoch': 1.33} | |
| 44%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 36/81 [14:56<18:53, 25.18s/it] 46%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 37/81 [15:21<18:23, 25.08s/it] {'loss': 2.048, 'grad_norm': 1.1598583459854126, 'learning_rate': 1.2169788081063178e-06, 'memory/max_active (GiB)': 76.99, 'memory/max_allocated (GiB)': 76.99, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 848.84, 'epoch': 1.37} | |
| 46%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 37/81 [15:21<18:23, 25.08s/it] 47%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 38/81 [15:47<18:07, 25.30s/it] {'loss': 1.4693, 'grad_norm': 1.03346848487854, 'learning_rate': 1.1779978956775504e-06, 'memory/max_active (GiB)': 81.33, 'memory/max_allocated (GiB)': 81.33, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 864.18, 'epoch': 1.41} | |
| 47%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 38/81 [15:47<18:07, 25.30s/it] 48%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 39/81 [16:09<17:02, 24.35s/it] {'loss': 0.629, 'grad_norm': 0.5300745964050293, 'learning_rate': 1.1387355319890683e-06, 'memory/max_active (GiB)': 38.13, 'memory/max_allocated (GiB)': 38.13, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 660.81, 'epoch': 1.45} | |
| 48%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 39/81 [16:09<17:02, 24.35s/it] 49%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 40/81 [16:35<17:04, 24.98s/it] {'loss': 1.354, 'grad_norm': 0.45447468757629395, 'learning_rate': 1.0992537989080618e-06, 'memory/max_active (GiB)': 68.52, 'memory/max_allocated (GiB)': 68.52, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 953.46, 'epoch': 1.48} | |
| 49%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 40/81 [16:35<17:04, 24.98s/it] 51%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 41/81 [17:00<16:39, 25.00s/it] {'loss': 0.7253, 'grad_norm': 0.3777391314506531, 'learning_rate': 1.0596151251698198e-06, 'memory/max_active (GiB)': 86.09, 'memory/max_allocated (GiB)': 86.09, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 994.9, 'epoch': 1.52} | |
| 51%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 41/81 [17:00<16:39, 25.00s/it] 52%|ââââââââââââââââââââââââââââââ | 42/81 [17:25<16:14, 25.00s/it] {'loss': 0.5797, 'grad_norm': 0.4444688856601715, 'learning_rate': 1.01988218766507e-06, 'memory/max_active (GiB)': 67.08, 'memory/max_allocated (GiB)': 67.08, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 768.21, 'epoch': 1.56} | |
| 52%|ââââââââââââââââââââââââââââââ | 42/81 [17:25<16:14, 25.00s/it] 53%|âââââââââââââââââââââââââââââââ | 43/81 [17:52<16:13, 25.61s/it] {'loss': 1.7366, 'grad_norm': 0.8071985244750977, 'learning_rate': 9.801178123349297e-07, 'memory/max_active (GiB)': 84.78, 'memory/max_allocated (GiB)': 84.78, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 1054.07, 'epoch': 1.59} | |
| 53%|âââââââââââââââââââââââââââââââ | 43/81 [17:52<16:13, 25.61s/it] 54%|ââââââââââââââââââââââââââââââââ | 44/81 [18:17<15:33, 25.23s/it] {'loss': 0.8716, 'grad_norm': 0.7125491499900818, 'learning_rate': 9.403848748301802e-07, 'memory/max_active (GiB)': 62.62, 'memory/max_allocated (GiB)': 62.62, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 829.41, 'epoch': 1.63} | |
| 54%|ââââââââââââââââââââââââââââââââ | 44/81 [18:17<15:33, 25.23s/it] 56%|âââââââââââââââââââââââââââââââââ | 45/81 [18:43<15:15, 25.42s/it] {'loss': 3.5353, 'grad_norm': 1.353916883468628, 'learning_rate': 9.007462010919385e-07, 'memory/max_active (GiB)': 69.41, 'memory/max_allocated (GiB)': 69.41, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 878.82, 'epoch': 1.67} | |
| 56%|âââââââââââââââââââââââââââââââââ | 45/81 [18:43<15:15, 25.42s/it] 57%|âââââââââââââââââââââââââââââââââ | 46/81 [19:06<14:29, 24.85s/it] {'loss': 1.3489, 'grad_norm': 1.1499241590499878, 'learning_rate': 8.612644680109318e-07, 'memory/max_active (GiB)': 79.0, 'memory/max_allocated (GiB)': 79.0, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 868.24, 'epoch': 1.71} | |
| 57%|âââââââââââââââââââââââââââââââââ | 46/81 [19:06<14:29, 24.85s/it] 58%|ââââââââââââââââââââââââââââââââââ | 47/81 [19:31<14:08, 24.96s/it] {'loss': 1.543, 'grad_norm': 1.5999720096588135, 'learning_rate': 8.220021043224499e-07, 'memory/max_active (GiB)': 61.32, 'memory/max_allocated (GiB)': 61.32, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 935.5, 'epoch': 1.74} | |
| 58%|ââââââââââââââââââââââââââââââââââ | 47/81 [19:31<14:08, 24.96s/it] 59%|âââââââââââââââââââââââââââââââââââ | 48/81 [19:56<13:39, 24.84s/it] {'loss': 1.2282, 'grad_norm': 0.930808961391449, 'learning_rate': 7.830211918936819e-07, 'memory/max_active (GiB)': 44.35, 'memory/max_allocated (GiB)': 44.35, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 827.33, 'epoch': 1.78} | |
| 59%|âââââââââââââââââââââââââââââââââââ | 48/81 [19:56<13:39, 24.84s/it] 60%|âââââââââââââââââââââââââââââââââââ | 49/81 [20:19<12:56, 24.27s/it] {'loss': 3.0884, 'grad_norm': 1.004947543144226, 'learning_rate': 7.443833675595253e-07, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 782.62, 'epoch': 1.82} | |
| 60%|âââââââââââââââââââââââââââââââââââ | 49/81 [20:19<12:56, 24.27s/it] 62%|ââââââââââââââââââââââââââââââââââââ | 50/81 [20:44<12:37, 24.44s/it] {'loss': 1.0819, 'grad_norm': 0.8130286335945129, 'learning_rate': 7.061497256620792e-07, 'memory/max_active (GiB)': 49.75, 'memory/max_allocated (GiB)': 49.75, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 915.1, 'epoch': 1.85} | |
| 62%|ââââââââââââââââââââââââââââââââââââ | 50/81 [20:44<12:37, 24.44s/it] 63%|âââââââââââââââââââââââââââââââââââââ | 51/81 [21:11<12:40, 25.36s/it] {'loss': 1.6019, 'grad_norm': 0.8784174919128418, 'learning_rate': 6.683807214479323e-07, 'memory/max_active (GiB)': 75.17, 'memory/max_allocated (GiB)': 75.17, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 988.63, 'epoch': 1.89} | |
| 63%|âââââââââââââââââââââââââââââââââââââ | 51/81 [21:11<12:40, 25.36s/it] 64%|ââââââââââââââââââââââââââââââââââââââ | 52/81 [21:38<12:30, 25.88s/it] {'loss': 1.3716, 'grad_norm': 1.0731853246688843, 'learning_rate': 6.311360754759923e-07, 'memory/max_active (GiB)': 88.84, 'memory/max_allocated (GiB)': 88.84, 'memory/device_reserved (GiB)': 90.42, 'tokens_per_second_per_gpu': 865.92, 'epoch': 1.93} | |
| 64%|ââââââââââââââââââââââââââââââââââââââ | 52/81 [21:38<12:30, 25.88s/it] 65%|ââââââââââââââââââââââââââââââââââââââ | 53/81 [22:03<11:54, 25.53s/it] {'loss': 0.4119, 'grad_norm': 0.340857595205307, 'learning_rate': 5.944746791870061e-07, 'memory/max_active (GiB)': 86.43, 'memory/max_allocated (GiB)': 86.43, 'memory/device_reserved (GiB)': 89.76, 'tokens_per_second_per_gpu': 873.98, 'epoch': 1.97} | |
| 65%|ââââââââââââââââââââââââââââââââââââââ | 53/81 [22:03<11:54, 25.53s/it] 67%|âââââââââââââââââââââââââââââââââââââââ | 54/81 [22:25<11:04, 24.60s/it] {'loss': 0.7065, 'grad_norm': 0.48797884583473206, 'learning_rate': 5.584545017840885e-07, 'memory/max_active (GiB)': 86.08, 'memory/max_allocated (GiB)': 86.08, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 844.64, 'epoch': 2.0} | |
| 67%|âââââââââââââââââââââââââââââââââââââââ | 54/81 [22:25<11:04, 24.60s/it][2025-12-27 21:41:29,979] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-54 | |
| [2025-12-27 21:41:32,641] [WARNING] [py.warnings._showwarnmsg:110] [PID:122677] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization | |
| warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") | |
| 68%|ââââââââââââââââââââââââââââââââââââââââ | 55/81 [22:54<11:09, 25.76s/it] {'loss': 2.1499, 'grad_norm': 1.208370566368103, 'learning_rate': 5.231324985714941e-07, 'memory/max_active (GiB)': 81.46, 'memory/max_allocated (GiB)': 81.46, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 850.71, 'epoch': 2.04} | |
| 68%|ââââââââââââââââââââââââââââââââââââââââ | 55/81 [22:54<11:09, 25.76s/it] 69%|ââââââââââââââââââââââââââââââââââââââââ | 56/81 [23:17<10:24, 24.99s/it] {'loss': 0.8085, 'grad_norm': 0.6215103268623352, 'learning_rate': 4.885645208965778e-07, 'memory/max_active (GiB)': 45.1, 'memory/max_allocated (GiB)': 45.1, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 825.2, 'epoch': 2.07} | |
| 69%|ââââââââââââââââââââââââââââââââââââââââ | 56/81 [23:17<10:24, 24.99s/it] 70%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 57/81 [23:42<10:00, 25.02s/it] {'loss': 1.8707, 'grad_norm': 1.5340229272842407, 'learning_rate': 4.5480522783733265e-07, 'memory/max_active (GiB)': 74.01, 'memory/max_allocated (GiB)': 74.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 974.6, 'epoch': 2.11} | |
| 70%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 57/81 [23:42<10:00, 25.02s/it] 72%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 58/81 [24:06<09:27, 24.66s/it] {'loss': 1.289, 'grad_norm': 0.5400649309158325, 'learning_rate': 4.2190799977515145e-07, 'memory/max_active (GiB)': 68.43, 'memory/max_allocated (GiB)': 68.43, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 871.48, 'epoch': 2.15} | |
| 72%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 58/81 [24:06<09:27, 24.66s/it] 73%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 59/81 [24:32<09:11, 25.05s/it] {'loss': 0.8458, 'grad_norm': 0.4076080322265625, 'learning_rate': 3.8992485398947563e-07, 'memory/max_active (GiB)': 62.83, 'memory/max_allocated (GiB)': 62.83, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 1008.98, 'epoch': 2.19} | |
| 73%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 59/81 [24:32<09:11, 25.05s/it] 74%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 60/81 [24:56<08:39, 24.73s/it] {'loss': 1.7388, 'grad_norm': 0.8173409700393677, 'learning_rate': 3.5890636240778015e-07, 'memory/max_active (GiB)': 51.13, 'memory/max_allocated (GiB)': 51.13, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 880.54, 'epoch': 2.22} | |
| 74%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 60/81 [24:56<08:39, 24.73s/it] 75%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 61/81 [25:19<08:04, 24.21s/it] {'loss': 0.7868, 'grad_norm': 0.6387739181518555, 'learning_rate': 3.289015716409631e-07, 'memory/max_active (GiB)': 41.57, 'memory/max_allocated (GiB)': 41.57, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 740.08, 'epoch': 2.26} | |
| 75%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 61/81 [25:19<08:04, 24.21s/it] 77%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 62/81 [25:45<07:49, 24.70s/it] {'loss': 0.8116, 'grad_norm': 0.691184401512146, 'learning_rate': 2.9995792543057473e-07, 'memory/max_active (GiB)': 71.73, 'memory/max_allocated (GiB)': 71.73, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 889.11, 'epoch': 2.3} | |
| 77%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 62/81 [25:45<07:49, 24.70s/it] 78%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 63/81 [26:09<07:22, 24.58s/it] {'loss': 0.6392, 'grad_norm': 0.492448627948761, 'learning_rate': 2.721211896305059e-07, 'memory/max_active (GiB)': 66.99, 'memory/max_allocated (GiB)': 66.99, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 779.69, 'epoch': 2.33} | |
| 78%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 63/81 [26:09<07:22, 24.58s/it] 79%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 64/81 [26:33<06:56, 24.47s/it] {'loss': 0.7128, 'grad_norm': 0.5834090709686279, 'learning_rate': 2.454353798417698e-07, 'memory/max_active (GiB)': 79.01, 'memory/max_allocated (GiB)': 79.01, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 821.37, 'epoch': 2.37} | |
| 79%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 64/81 [26:33<06:56, 24.47s/it] 80%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 65/81 [26:56<06:24, 24.04s/it] {'loss': 0.8861, 'grad_norm': 0.6599698662757874, 'learning_rate': 2.1994269181478798e-07, 'memory/max_active (GiB)': 47.26, 'memory/max_allocated (GiB)': 47.26, 'memory/device_reserved (GiB)': 89.97, 'tokens_per_second_per_gpu': 811.24, 'epoch': 2.41} | |
| 80%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 65/81 [26:56<06:24, 24.04s/it] 81%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 66/81 [27:23<06:10, 24.71s/it] {'loss': 2.1615, 'grad_norm': 0.7803909778594971, 'learning_rate': 1.956834347292352e-07, 'memory/max_active (GiB)': 89.03, 'memory/max_allocated (GiB)': 89.03, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 882.58, 'epoch': 2.45} | |
| 81%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 66/81 [27:23<06:10, 24.71s/it] 83%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 67/81 [27:49<05:51, 25.11s/it] {'loss': 0.9951, 'grad_norm': 0.558167576789856, 'learning_rate': 1.7269596745694292e-07, 'memory/max_active (GiB)': 89.47, 'memory/max_allocated (GiB)': 89.47, 'memory/device_reserved (GiB)': 90.54, 'tokens_per_second_per_gpu': 948.51, 'epoch': 2.48} | |
| 83%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 67/81 [27:49<05:51, 25.11s/it] 84%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 68/81 [28:16<05:34, 25.73s/it] {'loss': 2.4148, 'grad_norm': 1.3278695344924927, 'learning_rate': 1.5101663790863595e-07, 'memory/max_active (GiB)': 89.46, 'memory/max_allocated (GiB)': 89.46, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.43, 'epoch': 2.52} | |
| 84%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 68/81 [28:16<05:34, 25.73s/it] 85%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 69/81 [28:40<05:01, 25.12s/it] {'loss': 1.4344, 'grad_norm': 1.1428226232528687, 'learning_rate': 1.306797255604175e-07, 'memory/max_active (GiB)': 51.65, 'memory/max_allocated (GiB)': 51.65, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 852.22, 'epoch': 2.56} | |
| 85%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 69/81 [28:40<05:01, 25.12s/it] 86%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 70/81 [29:03<04:30, 24.60s/it] {'loss': 1.6188, 'grad_norm': 1.5298364162445068, 'learning_rate': 1.1171738725086832e-07, 'memory/max_active (GiB)': 68.56, 'memory/max_allocated (GiB)': 68.56, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 861.71, 'epoch': 2.59} | |
| 86%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 70/81 [29:03<04:30, 24.60s/it] 88%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 71/81 [29:29<04:09, 24.94s/it] {'loss': 0.6562, 'grad_norm': 0.5825881361961365, 'learning_rate': 9.415960633447673e-08, 'memory/max_active (GiB)': 86.4, 'memory/max_allocated (GiB)': 86.4, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 991.35, 'epoch': 2.63} | |
| 88%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 71/81 [29:29<04:09, 24.94s/it] 89%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 72/81 [29:52<03:40, 24.49s/it] {'loss': 3.4467, 'grad_norm': 1.508130669593811, 'learning_rate': 7.803414527179342e-08, 'memory/max_active (GiB)': 56.32, 'memory/max_allocated (GiB)': 56.32, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 757.41, 'epoch': 2.67} | |
| 89%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 72/81 [29:52<03:40, 24.49s/it] 90%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 73/81 [30:17<03:15, 24.48s/it] {'loss': 3.2649, 'grad_norm': 1.0750031471252441, 'learning_rate': 6.336650173127223e-08, 'memory/max_active (GiB)': 56.28, 'memory/max_allocated (GiB)': 56.28, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 793.48, 'epoch': 2.71} | |
| 90%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 73/81 [30:17<03:15, 24.48s/it] 91%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 74/81 [30:40<02:48, 24.07s/it] {'loss': 2.3341, 'grad_norm': 2.035534381866455, 'learning_rate': 5.017986827221732e-08, 'memory/max_active (GiB)': 58.54, 'memory/max_allocated (GiB)': 58.54, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 766.33, 'epoch': 2.74} | |
| 91%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 74/81 [30:40<02:48, 24.07s/it] 93%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 75/81 [31:04<02:24, 24.10s/it] {'loss': 1.6347, 'grad_norm': 1.792739748954773, 'learning_rate': 3.849509567257958e-08, 'memory/max_active (GiB)': 69.34, 'memory/max_allocated (GiB)': 69.34, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 786.7, 'epoch': 2.78} | |
| 93%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 75/81 [31:04<02:24, 24.10s/it] 94%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 76/81 [31:32<02:06, 25.35s/it] {'loss': 1.4495, 'grad_norm': 1.2769355773925781, 'learning_rate': 2.8330659959589942e-08, 'memory/max_active (GiB)': 86.1, 'memory/max_allocated (GiB)': 86.1, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1065.84, 'epoch': 2.82} | |
| 94%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 76/81 [31:32<02:06, 25.35s/it] 95%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 77/81 [31:56<01:39, 24.82s/it] {'loss': 2.2957, 'grad_norm': 1.664533257484436, 'learning_rate': 1.9702633195363917e-08, 'memory/max_active (GiB)': 60.57, 'memory/max_allocated (GiB)': 60.57, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 865.29, 'epoch': 2.85} | |
| 95%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 77/81 [31:56<01:39, 24.82s/it] 96%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 78/81 [32:24<01:17, 25.73s/it] {'loss': 1.8847, 'grad_norm': 1.805460810661316, 'learning_rate': 1.2624658063666638e-08, 'memory/max_active (GiB)': 84.93, 'memory/max_allocated (GiB)': 84.93, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 789.11, 'epoch': 2.89} | |
| 96%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 78/81 [32:24<01:17, 25.73s/it] 98%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 79/81 [32:49<00:51, 25.71s/it] {'loss': 0.6906, 'grad_norm': 0.49316051602363586, 'learning_rate': 7.10792629802659e-09, 'memory/max_active (GiB)': 79.04, 'memory/max_allocated (GiB)': 79.04, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 1009.91, 'epoch': 2.93} | |
| 98%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 79/81 [32:49<00:51, 25.71s/it] 99%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 80/81 [33:16<00:26, 26.12s/it] {'loss': 1.9236, 'grad_norm': 1.247730016708374, 'learning_rate': 3.1611609853041676e-09, 'memory/max_active (GiB)': 81.45, 'memory/max_allocated (GiB)': 81.45, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 944.97, 'epoch': 2.97} | |
| 99%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ | 80/81 [33:16<00:26, 26.12s/it] 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 81/81 [33:40<00:00, 25.25s/it] {'loss': 0.9602, 'grad_norm': 0.6849121451377869, 'learning_rate': 7.906027726981567e-10, 'memory/max_active (GiB)': 76.78, 'memory/max_allocated (GiB)': 76.78, 'memory/device_reserved (GiB)': 90.68, 'tokens_per_second_per_gpu': 870.01, 'epoch': 3.0} | |
| 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 81/81 [33:40<00:00, 25.25s/it][2025-12-27 21:52:44,059] [INFO] [axolotl.core.trainers.base._save:671] [PID:122677] Saving model checkpoint to ./outputs/mymodel/checkpoint-81 | |
| {'train_runtime': 2021.9142, 'train_samples_per_second': 5.128, 'train_steps_per_second': 0.04, 'train_loss': 1.5273678048893258, 'memory/max_active (GiB)': 4.6, 'memory/max_allocated (GiB)': 4.6, 'memory/device_reserved (GiB)': 90.68, 'epoch': 3.0} | |
| 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 81/81 [33:41<00:00, 25.25s/it] 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 81/81 [33:41<00:00, 24.96s/it] | |
| [2025-12-27 21:52:46,295] [INFO] [axolotl.train.save_trained_model:218] [PID:122677] Training completed! Saving trained model to ./outputs/mymodel. | |
| [2025-12-27 21:52:47,133] [INFO] [axolotl.train.save_trained_model:336] [PID:122677] Model successfully saved to ./outputs/mymodel | |