| [2025-10-25 17:49:53,747] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:4001] bf16 support detected, enabling for this configuration. | |
| [2025-10-25 17:49:53,988] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:4001] baseline 0.000GB () | |
| [2025-10-25 17:49:53,988] [INFO] [axolotl.cli.config.load_cfg:248] [PID:4001] config: | |
| { | |
| "activation_offloading": true, | |
| "axolotl_config_path": "train.yml", | |
| "base_model": "Qwen/Qwen3-4B-Instruct-2507", | |
| "base_model_config": "Qwen/Qwen3-4B-Instruct-2507", | |
| "batch_size": 4, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_86", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "chat_template": "tokenizer_default", | |
| "context_parallel_size": 1, | |
| "cosine_min_lr_ratio": 0.1, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_num_proc": 16, | |
| "dataset_prepared_path": "last_run_prepared", | |
| "datasets": [ | |
| { | |
| "chat_template": "tokenizer_default", | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "WokeAI/polititune-tankie-warmup", | |
| "split": "train", | |
| "trust_remote_code": false, | |
| "type": "chat_template" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.8.0" | |
| }, | |
| "eval_batch_size": 1, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_sample_packing": true, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "flash_attention": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 4, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": true | |
| }, | |
| "group_by_length": false, | |
| "include_tkps": true, | |
| "learning_rate": 1e-05, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "logging_steps": 1, | |
| "lora_dropout": 0.0, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "constant", | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 1, | |
| "model_config_type": "qwen3", | |
| "num_epochs": 2.0, | |
| "optimizer": "paged_ademamix_8bit", | |
| "otel_metrics_host": "localhost", | |
| "otel_metrics_port": 8000, | |
| "output_dir": "./model-output", | |
| "pad_to_sequence_len": true, | |
| "pretrain_multipack_attn": true, | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing": true, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": true, | |
| "save_safetensors": true, | |
| "save_steps": 0.25, | |
| "saves_per_epoch": 2, | |
| "sequence_len": 2048, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "special_tokens": { | |
| "eos_token": "<|im_end|>" | |
| }, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "trust_remote_code": true, | |
| "use_otel_metrics": false, | |
| "use_ray": false, | |
| "use_wandb": true, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "wandb_project": "polititune-q34b-warmup", | |
| "warmup_ratio": 0.05, | |
| "weight_decay": 0.01, | |
| "world_size": 1 | |
| } | |
| [2025-10-25 17:49:53,990] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:4001] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets. | |
| [2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <|im_end|> | |
| [2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None | |
| [2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <|endoftext|> | |
| [2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None | |
| [2025-10-25 17:49:54,858] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:4001] Loading prepared dataset from disk at last_run_prepared/a9098d9a4841d51fd558499bade3d148... | |
| [2025-10-25 17:49:54,863] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:4001] total_num_tokens: 88_397 | |
| [2025-10-25 17:49:54,864] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:4001] `total_supervised_tokens: 81_792` | |
| [2025-10-25 17:49:54,866] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially. | |
| [2025-10-25 17:49:55,435] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially. | |
| [2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.151627779006958 | |
| [2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially. | |
| [2025-10-25 17:49:55,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.14948749542236328 | |
| [2025-10-25 17:49:55,737] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially. | |
| [2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.15515494346618652 | |
| [2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially. | |
| [2025-10-25 17:49:56,073] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.18137788772583008 | |
| [2025-10-25 17:49:56,094] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46] | |
| [2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:4001] data_loader_len: 11 | |
| [2025-10-25 17:49:56,094] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:4001] sample_packing_eff_est across ranks: [0.9383173403532609] | |
| [2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:4001] sample_packing_eff_est: 0.94 | |
| [2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:4001] total_num_steps: 22 | |
| [2025-10-25 17:49:56,094] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:4001] Maximum number of steps set at 22 | |
| [2025-10-25 17:49:56,115] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:4001] Loading tokenizer... Qwen/Qwen3-4B-Instruct-2507 | |
| [2025-10-25 17:49:56,797] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <|im_end|> | |
| [2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None | |
| [2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <|endoftext|> | |
| [2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None | |
| [2025-10-25 17:49:56,798] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:4001] Loading model | |
| [2025-10-25 17:49:57,139] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:4001] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2025-10-25 17:49:57,140] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:4001] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2025-10-25 17:49:57,140] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:4001] Applying multipack dataloader patch for sample packing... | |
| Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββ| 3/3 [00:00<00:00, 78.33it/s] | |
| [2025-10-25 17:49:58,776] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:4001] Converting modules to torch.bfloat16 | |
| [2025-10-25 17:49:59,230] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:4001] Memory usage after model load 0.000GB () | |
| [2025-10-25 17:50:01,453] [INFO] [axolotl.train.save_initial_configs:402] [PID:4001] Pre-saving tokenizer to ./model-output... | |
| [2025-10-25 17:50:01,532] [INFO] [axolotl.train.save_initial_configs:407] [PID:4001] Pre-saving model config to ./model-output... | |
| [2025-10-25 17:50:01,534] [INFO] [axolotl.train.execute_training:196] [PID:4001] Starting trainer... | |
| [2025-10-25 17:50:02,423] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32518529891967773 | |
| [2025-10-25 17:50:02,751] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3276631832122803 | |
| [2025-10-25 17:50:03,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3279576301574707 | |
| [2025-10-25 17:50:03,406] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32717275619506836 | |
| [2025-10-25 17:50:03,407] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46] | |
| [34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin | |
| [34m[1mwandb[0m: [38;5;178mβ’Ώ[0m Waiting for wandb.init()... | |
| [Am[2K [34m[1mwandb[0m: [38;5;178mβ£»[0m setting up run f79oi2ub (0.1s) | |
| [Am[2K [34m[1mwandb[0m: [38;5;178mβ£½[0m setting up run f79oi2ub (0.1s) | |
| [Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.22.2 | |
| [34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20251025_175003-f79oi2ub[0m | |
| [34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing. | |
| [34m[1mwandb[0m: Syncing run [33mquiet-snowflake-2[0m | |
| [34m[1mwandb[0m: βοΈ View project at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup[0m | |
| [34m[1mwandb[0m: π View run at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup/runs/f79oi2ub[0m | |
| [34m[1mwandb[0m: Detected [huggingface_hub.inference] in use. | |
| [34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. | |
| [34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/ | |
| [34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") | |
| [2025-10-25 17:50:05,900] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:4001] The Axolotl config has been saved to the WandB run under files. | |
| 0%| | 0/22 [00:00<?, ?it/s] 5%|βββ | 1/22 [00:17<06:07, 17.49s/it] {'loss': 3.3053, 'grad_norm': 24.125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.49, 'memory/max_allocated (GiB)': 18.49, 'memory/device_reserved (GiB)': 21.18, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.09} | |
| 5%|βββ | 1/22 [00:17<06:07, 17.49s/it] 9%|ββββββ | 2/22 [00:27<04:24, 13.21s/it] {'loss': 2.9641, 'grad_norm': 12.0, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 704.32, 'epoch': 0.17} | |
| 9%|ββββββ | 2/22 [00:27<04:24, 13.21s/it] 14%|βββββββββ | 3/22 [00:37<03:43, 11.76s/it] {'loss': 2.8185, 'grad_norm': 8.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 731.68, 'epoch': 0.26} | |
| 14%|βββββββββ | 3/22 [00:37<03:43, 11.76s/it] 18%|ββββββββββββ | 4/22 [00:47<03:19, 11.08s/it] {'loss': 2.8666, 'grad_norm': 6.46875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 728.47, 'epoch': 0.35} | |
| 18%|ββββββββββββ | 4/22 [00:47<03:19, 11.08s/it] 23%|βββββββββββββββ | 5/22 [00:57<03:02, 10.71s/it] {'loss': 2.7515, 'grad_norm': 5.875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 716.16, 'epoch': 0.43} | |
| 23%|βββββββββββββββ | 5/22 [00:57<03:02, 10.71s/it] 27%|ββββββββββββββββββ | 6/22 [01:08<02:49, 10.62s/it] {'loss': 2.7633, 'grad_norm': 5.53125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 681.72, 'epoch': 0.52} | |
| 27%|ββββββββββββββββββ | 6/22 [01:08<02:49, 10.62s/it][2025-10-25 17:51:14,180] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-6 | |
| 32%|βββββββββββββββββββββ | 7/22 [01:28<03:27, 13.81s/it] {'loss': 2.6924, 'grad_norm': 5.59375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 699.03, 'epoch': 0.61} | |
| 32%|βββββββββββββββββββββ | 7/22 [01:28<03:27, 13.81s/it] 36%|ββββββββββββββββββββββββ | 8/22 [01:38<02:56, 12.62s/it] {'loss': 2.7207, 'grad_norm': 5.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 740.89, 'epoch': 0.7} | |
| 36%|ββββββββββββββββββββββββ | 8/22 [01:38<02:56, 12.62s/it] 41%|βββββββββββββββββββββββββββ | 9/22 [01:48<02:33, 11.82s/it] {'loss': 2.6885, 'grad_norm': 4.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 718.9, 'epoch': 0.78} | |
| 41%|βββββββββββββββββββββββββββ | 9/22 [01:48<02:33, 11.82s/it] 45%|βββββββββββββββββββββββββββββ | 10/22 [01:58<02:15, 11.29s/it] {'loss': 2.6238, 'grad_norm': 4.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 730.17, 'epoch': 0.87} | |
| 45%|βββββββββββββββββββββββββββββ | 10/22 [01:58<02:15, 11.29s/it] 50%|ββββββββββββββββββββββββββββββββ | 11/22 [02:08<02:00, 10.92s/it] {'loss': 2.6501, 'grad_norm': 4.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 702.7, 'epoch': 0.96} | |
| 50%|ββββββββββββββββββββββββββββββββ | 11/22 [02:08<02:00, 10.92s/it] 55%|βββββββββββββββββββββββββββββββββββ | 12/22 [02:16<01:37, 9.78s/it] {'loss': 2.7149, 'grad_norm': 6.96875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 286.86, 'epoch': 1.0} | |
| 55%|βββββββββββββββββββββββββββββββββββ | 12/22 [02:16<01:37, 9.78s/it][2025-10-25 17:52:22,046] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-12 | |
| 59%|ββββββββββββββββββββββββββββββββββββββ | 13/22 [02:38<02:02, 13.56s/it] {'loss': 2.5872, 'grad_norm': 4.375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 662.73, 'epoch': 1.09} | |
| 59%|ββββββββββββββββββββββββββββββββββββββ | 13/22 [02:38<02:02, 13.56s/it] 64%|βββββββββββββββββββββββββββββββββββββββββ | 14/22 [02:48<01:40, 12.52s/it] {'loss': 2.532, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 737.0, 'epoch': 1.17} | |
| 64%|βββββββββββββββββββββββββββββββββββββββββ | 14/22 [02:48<01:40, 12.52s/it] 68%|ββββββββββββββββββββββββββββββββββββββββββββ | 15/22 [02:58<01:22, 11.78s/it] {'loss': 2.4174, 'grad_norm': 3.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.14, 'epoch': 1.26} | |
| 68%|ββββββββββββββββββββββββββββββββββββββββββββ | 15/22 [02:58<01:22, 11.78s/it] 73%|βββββββββββββββββββββββββββββββββββββββββββββββ | 16/22 [03:08<01:07, 11.28s/it] {'loss': 2.4644, 'grad_norm': 4.09375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.69, 'epoch': 1.35} | |
| 73%|βββββββββββββββββββββββββββββββββββββββββββββββ | 16/22 [03:08<01:07, 11.28s/it] 77%|ββββββββββββββββββββββββββββββββββββββββββββββββββ | 17/22 [03:18<00:54, 10.94s/it] {'loss': 2.5299, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 713.56, 'epoch': 1.43} | |
| 77%|ββββββββββββββββββββββββββββββββββββββββββββββββββ | 17/22 [03:18<00:54, 10.94s/it] 82%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 18/22 [03:28<00:42, 10.70s/it] {'loss': 2.4902, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 674.14, 'epoch': 1.52} | |
| 82%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 18/22 [03:28<00:42, 10.70s/it][2025-10-25 17:53:34,885] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-18 | |
| 86%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 19/22 [03:48<00:40, 13.41s/it] {'loss': 2.4657, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 655.58, 'epoch': 1.61} | |
| 86%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 19/22 [03:48<00:40, 13.41s/it] 91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 20/22 [03:58<00:24, 12.42s/it] {'loss': 2.4085, 'grad_norm': 4.0625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 709.39, 'epoch': 1.7} | |
| 91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 20/22 [03:58<00:24, 12.42s/it] 95%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 21/22 [04:08<00:11, 11.73s/it] {'loss': 2.3577, 'grad_norm': 3.796875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 691.03, 'epoch': 1.78} | |
| 95%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 21/22 [04:08<00:11, 11.73s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 22/22 [04:19<00:00, 11.26s/it] {'loss': 2.3456, 'grad_norm': 3.953125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 707.03, 'epoch': 1.87} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 22/22 [04:19<00:00, 11.26s/it][2025-10-25 17:54:25,016] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-22 | |
| {'train_runtime': 271.0135, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.081, 'train_loss': 2.643556302244013, 'memory/max_active (GiB)': 7.67, 'memory/max_allocated (GiB)': 7.67, 'memory/device_reserved (GiB)': 21.19, 'epoch': 1.87} | |
| 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 22/22 [04:28<00:00, 11.26s/it] 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 22/22 [04:28<00:00, 12.21s/it] | |
| [2025-10-25 17:54:34,884] [INFO] [axolotl.train.save_trained_model:218] [PID:4001] Training completed! Saving trained model to ./model-output. | |
| [2025-10-25 17:54:44,496] [INFO] [axolotl.train.save_trained_model:336] [PID:4001] Model successfully saved to ./model-output | |
| [0m |