Upload folder using huggingface_hub

1f5ec7c verified 3 months ago

28.1 kB

	[2025-10-25 17:49:53,747] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:4001] bf16 support detected, enabling for this configuration.
	[2025-10-25 17:49:53,988] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:4001] baseline 0.000GB ()
	[2025-10-25 17:49:53,988] [INFO] [axolotl.cli.config.load_cfg:248] [PID:4001] config:
	{
	"activation_offloading": true,
	"axolotl_config_path": "train.yml",
	"base_model": "Qwen/Qwen3-4B-Instruct-2507",
	"base_model_config": "Qwen/Qwen3-4B-Instruct-2507",
	"batch_size": 4,
	"bf16": true,
	"capabilities": {
	"bf16": true,
	"compute_capability": "sm_86",
	"fp8": false,
	"n_gpu": 1,
	"n_node": 1
	},
	"chat_template": "tokenizer_default",
	"context_parallel_size": 1,
	"cosine_min_lr_ratio": 0.1,
	"dataloader_num_workers": 1,
	"dataloader_pin_memory": true,
	"dataloader_prefetch_factor": 256,
	"dataset_num_proc": 16,
	"dataset_prepared_path": "last_run_prepared",
	"datasets": [
	{
	"chat_template": "tokenizer_default",
	"message_property_mappings": {
	"content": "content",
	"role": "role"
	},
	"path": "WokeAI/polititune-tankie-warmup",
	"split": "train",
	"trust_remote_code": false,
	"type": "chat_template"
	}
	],
	"ddp": false,
	"device": "cuda:0",
	"dion_rank_fraction": 1.0,
	"dion_rank_multiple_of": 1,
	"env_capabilities": {
	"torch_version": "2.8.0"
	},
	"eval_batch_size": 1,
	"eval_causal_lm_metrics": [
	"sacrebleu",
	"comet",
	"ter",
	"chrf"
	],
	"eval_max_new_tokens": 128,
	"eval_sample_packing": true,
	"eval_table_size": 0,
	"experimental_skip_move_to_device": true,
	"flash_attention": true,
	"fp16": false,
	"gradient_accumulation_steps": 4,
	"gradient_checkpointing": true,
	"gradient_checkpointing_kwargs": {
	"use_reentrant": true
	},
	"group_by_length": false,
	"include_tkps": true,
	"learning_rate": 1e-05,
	"lisa_layers_attribute": "model.layers",
	"load_best_model_at_end": false,
	"load_in_4bit": false,
	"load_in_8bit": false,
	"local_rank": 0,
	"logging_steps": 1,
	"lora_dropout": 0.0,
	"loraplus_lr_embedding": 1e-06,
	"lr_scheduler": "constant",
	"mean_resizing_embeddings": false,
	"micro_batch_size": 1,
	"model_config_type": "qwen3",
	"num_epochs": 2.0,
	"optimizer": "paged_ademamix_8bit",
	"otel_metrics_host": "localhost",
	"otel_metrics_port": 8000,
	"output_dir": "./model-output",
	"pad_to_sequence_len": true,
	"pretrain_multipack_attn": true,
	"profiler_steps_start": 0,
	"qlora_sharded_model_loading": false,
	"ray_num_workers": 1,
	"resources_per_worker": {
	"GPU": 1
	},
	"sample_packing": true,
	"sample_packing_bin_size": 200,
	"sample_packing_group_size": 100000,
	"save_only_model": true,
	"save_safetensors": true,
	"save_steps": 0.25,
	"saves_per_epoch": 2,
	"sequence_len": 2048,
	"shuffle_before_merging_datasets": false,
	"shuffle_merged_datasets": true,
	"skip_prepare_dataset": false,
	"special_tokens": {
	"eos_token": "<\|im_end\|>"
	},
	"streaming_multipack_buffer_size": 10000,
	"strict": false,
	"tensor_parallel_size": 1,
	"tiled_mlp_use_original_mlp": true,
	"tokenizer_config": "Qwen/Qwen3-4B-Instruct-2507",
	"tokenizer_save_jinja_files": true,
	"torch_dtype": "torch.bfloat16",
	"train_on_inputs": false,
	"trl": {
	"log_completions": false,
	"mask_truncated_completions": false,
	"ref_model_mixup_alpha": 0.9,
	"ref_model_sync_steps": 64,
	"scale_rewards": true,
	"sync_ref_model": false,
	"use_vllm": false,
	"vllm_server_host": "0.0.0.0",
	"vllm_server_port": 8000
	},
	"trust_remote_code": true,
	"use_otel_metrics": false,
	"use_ray": false,
	"use_wandb": true,
	"val_set_size": 0.0,
	"vllm": {
	"device": "auto",
	"dtype": "auto",
	"gpu_memory_utilization": 0.9,
	"host": "0.0.0.0",
	"port": 8000
	},
	"wandb_project": "polititune-q34b-warmup",
	"warmup_ratio": 0.05,
	"weight_decay": 0.01,
	"world_size": 1
	}
	[2025-10-25 17:49:53,990] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:4001] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
	[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <\|im_end\|>
	[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None
	[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <\|endoftext\|>
	[2025-10-25 17:49:54,857] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None
	[2025-10-25 17:49:54,858] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:4001] Loading prepared dataset from disk at last_run_prepared/a9098d9a4841d51fd558499bade3d148...
	[2025-10-25 17:49:54,863] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:4001] total_num_tokens: 88_397
	[2025-10-25 17:49:54,864] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:4001] `total_supervised_tokens: 81_792`
	[2025-10-25 17:49:54,866] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
	[2025-10-25 17:49:55,435] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
	[2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.151627779006958
	[2025-10-25 17:49:55,587] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
	[2025-10-25 17:49:55,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.14948749542236328
	[2025-10-25 17:49:55,737] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
	[2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.15515494346618652
	[2025-10-25 17:49:55,892] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4001] Using single process for pack_parallel, running sequentially.
	[2025-10-25 17:49:56,073] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.18137788772583008
	[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46]
	[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:4001] data_loader_len: 11
	[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:4001] sample_packing_eff_est across ranks: [0.9383173403532609]
	[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:4001] sample_packing_eff_est: 0.94
	[2025-10-25 17:49:56,094] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:4001] total_num_steps: 22
	[2025-10-25 17:49:56,094] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:4001] Maximum number of steps set at 22
	[2025-10-25 17:49:56,115] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:4001] Loading tokenizer... Qwen/Qwen3-4B-Instruct-2507
	[2025-10-25 17:49:56,797] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:4001] EOS: 151645 / <\|im_end\|>
	[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:4001] BOS: None / None
	[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4001] PAD: 151643 / <\|endoftext\|>
	[2025-10-25 17:49:56,798] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4001] UNK: None / None
	[2025-10-25 17:49:56,798] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:4001] Loading model
	[2025-10-25 17:49:57,139] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:4001] Patched Trainer.evaluation_loop with nanmean loss calculation
	[2025-10-25 17:49:57,140] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:4001] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
	[2025-10-25 17:49:57,140] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:4001] Applying multipack dataloader patch for sample packing...
	Loading checkpoint shards: 0%\| \| 0/3 [00:00<?, ?it/s] Loading checkpoint shards: 100%\|███████████████████████████████████████\| 3/3 [00:00<00:00, 78.33it/s]
	[2025-10-25 17:49:58,776] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:4001] Converting modules to torch.bfloat16
	[2025-10-25 17:49:59,230] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:4001] Memory usage after model load 0.000GB ()
	[2025-10-25 17:50:01,453] [INFO] [axolotl.train.save_initial_configs:402] [PID:4001] Pre-saving tokenizer to ./model-output...
	[2025-10-25 17:50:01,532] [INFO] [axolotl.train.save_initial_configs:407] [PID:4001] Pre-saving model config to ./model-output...
	[2025-10-25 17:50:01,534] [INFO] [axolotl.train.execute_training:196] [PID:4001] Starting trainer...
	[2025-10-25 17:50:02,423] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32518529891967773
	[2025-10-25 17:50:02,751] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3276631832122803
	[2025-10-25 17:50:03,079] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.3279576301574707
	[2025-10-25 17:50:03,406] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4001] generate_batches time: 0.32717275619506836
	[2025-10-25 17:50:03,407] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4001] gather_len_batches: [46]
	[34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
	[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
	[Am[2K [34m[1mwandb[0m: [38;5;178m⣻[0m setting up run f79oi2ub (0.1s)
	[Am[2K [34m[1mwandb[0m: [38;5;178m⣽[0m setting up run f79oi2ub (0.1s)
	[Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.22.2
	[34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20251025_175003-f79oi2ub[0m
	[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
	[34m[1mwandb[0m: Syncing run [33mquiet-snowflake-2[0m
	[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup[0m
	[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/fizzzz/polititune-q34b-warmup/runs/f79oi2ub[0m
	[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
	[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
	[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
	[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
	[2025-10-25 17:50:05,900] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:4001] The Axolotl config has been saved to the WandB run under files.
	0%\| \| 0/22 [00:00<?, ?it/s] 5%\|██▉ \| 1/22 [00:17<06:07, 17.49s/it] {'loss': 3.3053, 'grad_norm': 24.125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.49, 'memory/max_allocated (GiB)': 18.49, 'memory/device_reserved (GiB)': 21.18, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.09}
	5%\|██▉ \| 1/22 [00:17<06:07, 17.49s/it] 9%\|█████▉ \| 2/22 [00:27<04:24, 13.21s/it] {'loss': 2.9641, 'grad_norm': 12.0, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 704.32, 'epoch': 0.17}
	9%\|█████▉ \| 2/22 [00:27<04:24, 13.21s/it] 14%\|████████▊ \| 3/22 [00:37<03:43, 11.76s/it] {'loss': 2.8185, 'grad_norm': 8.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 731.68, 'epoch': 0.26}
	14%\|████████▊ \| 3/22 [00:37<03:43, 11.76s/it] 18%\|███████████▊ \| 4/22 [00:47<03:19, 11.08s/it] {'loss': 2.8666, 'grad_norm': 6.46875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 728.47, 'epoch': 0.35}
	18%\|███████████▊ \| 4/22 [00:47<03:19, 11.08s/it] 23%\|██████████████▊ \| 5/22 [00:57<03:02, 10.71s/it] {'loss': 2.7515, 'grad_norm': 5.875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 716.16, 'epoch': 0.43}
	23%\|██████████████▊ \| 5/22 [00:57<03:02, 10.71s/it] 27%\|█████████████████▋ \| 6/22 [01:08<02:49, 10.62s/it] {'loss': 2.7633, 'grad_norm': 5.53125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 681.72, 'epoch': 0.52}
	27%\|█████████████████▋ \| 6/22 [01:08<02:49, 10.62s/it][2025-10-25 17:51:14,180] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-6
	32%\|████████████████████▋ \| 7/22 [01:28<03:27, 13.81s/it] {'loss': 2.6924, 'grad_norm': 5.59375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 699.03, 'epoch': 0.61}
	32%\|████████████████████▋ \| 7/22 [01:28<03:27, 13.81s/it] 36%\|███████████████████████▋ \| 8/22 [01:38<02:56, 12.62s/it] {'loss': 2.7207, 'grad_norm': 5.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 740.89, 'epoch': 0.7}
	36%\|███████████████████████▋ \| 8/22 [01:38<02:56, 12.62s/it] 41%\|██████████████████████████▌ \| 9/22 [01:48<02:33, 11.82s/it] {'loss': 2.6885, 'grad_norm': 4.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 718.9, 'epoch': 0.78}
	41%\|██████████████████████████▌ \| 9/22 [01:48<02:33, 11.82s/it] 45%\|█████████████████████████████ \| 10/22 [01:58<02:15, 11.29s/it] {'loss': 2.6238, 'grad_norm': 4.625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 730.17, 'epoch': 0.87}
	45%\|█████████████████████████████ \| 10/22 [01:58<02:15, 11.29s/it] 50%\|████████████████████████████████ \| 11/22 [02:08<02:00, 10.92s/it] {'loss': 2.6501, 'grad_norm': 4.40625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 702.7, 'epoch': 0.96}
	50%\|████████████████████████████████ \| 11/22 [02:08<02:00, 10.92s/it] 55%\|██████████████████████████████████▉ \| 12/22 [02:16<01:37, 9.78s/it] {'loss': 2.7149, 'grad_norm': 6.96875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 286.86, 'epoch': 1.0}
	55%\|██████████████████████████████████▉ \| 12/22 [02:16<01:37, 9.78s/it][2025-10-25 17:52:22,046] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-12
	59%\|█████████████████████████████████████▊ \| 13/22 [02:38<02:02, 13.56s/it] {'loss': 2.5872, 'grad_norm': 4.375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 662.73, 'epoch': 1.09}
	59%\|█████████████████████████████████████▊ \| 13/22 [02:38<02:02, 13.56s/it] 64%\|████████████████████████████████████████▋ \| 14/22 [02:48<01:40, 12.52s/it] {'loss': 2.532, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 737.0, 'epoch': 1.17}
	64%\|████████████████████████████████████████▋ \| 14/22 [02:48<01:40, 12.52s/it] 68%\|███████████████████████████████████████████▋ \| 15/22 [02:58<01:22, 11.78s/it] {'loss': 2.4174, 'grad_norm': 3.9375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.14, 'epoch': 1.26}
	68%\|███████████████████████████████████████████▋ \| 15/22 [02:58<01:22, 11.78s/it] 73%\|██████████████████████████████████████████████▌ \| 16/22 [03:08<01:07, 11.28s/it] {'loss': 2.4644, 'grad_norm': 4.09375, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 712.69, 'epoch': 1.35}
	73%\|██████████████████████████████████████████████▌ \| 16/22 [03:08<01:07, 11.28s/it] 77%\|█████████████████████████████████████████████████▍ \| 17/22 [03:18<00:54, 10.94s/it] {'loss': 2.5299, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 713.56, 'epoch': 1.43}
	77%\|█████████████████████████████████████████████████▍ \| 17/22 [03:18<00:54, 10.94s/it] 82%\|████████████████████████████████████████████████████▎ \| 18/22 [03:28<00:42, 10.70s/it] {'loss': 2.4902, 'grad_norm': 4.15625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 674.14, 'epoch': 1.52}
	82%\|████████████████████████████████████████████████████▎ \| 18/22 [03:28<00:42, 10.70s/it][2025-10-25 17:53:34,885] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-18
	86%\|███████████████████████████████████████████████████████▎ \| 19/22 [03:48<00:40, 13.41s/it] {'loss': 2.4657, 'grad_norm': 3.90625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 655.58, 'epoch': 1.61}
	86%\|███████████████████████████████████████████████████████▎ \| 19/22 [03:48<00:40, 13.41s/it] 91%\|██████████████████████████████████████████████████████████▏ \| 20/22 [03:58<00:24, 12.42s/it] {'loss': 2.4085, 'grad_norm': 4.0625, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 709.39, 'epoch': 1.7}
	91%\|██████████████████████████████████████████████████████████▏ \| 20/22 [03:58<00:24, 12.42s/it] 95%\|█████████████████████████████████████████████████████████████ \| 21/22 [04:08<00:11, 11.73s/it] {'loss': 2.3577, 'grad_norm': 3.796875, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 691.03, 'epoch': 1.78}
	95%\|█████████████████████████████████████████████████████████████ \| 21/22 [04:08<00:11, 11.73s/it] 100%\|████████████████████████████████████████████████████████████████\| 22/22 [04:19<00:00, 11.26s/it] {'loss': 2.3456, 'grad_norm': 3.953125, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 18.65, 'memory/max_allocated (GiB)': 18.65, 'memory/device_reserved (GiB)': 21.19, 'tokens_per_second_per_gpu': 707.03, 'epoch': 1.87}
	100%\|████████████████████████████████████████████████████████████████\| 22/22 [04:19<00:00, 11.26s/it][2025-10-25 17:54:25,016] [INFO] [axolotl.core.trainers.base._save:665] [PID:4001] Saving model checkpoint to ./model-output/checkpoint-22
	{'train_runtime': 271.0135, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.081, 'train_loss': 2.643556302244013, 'memory/max_active (GiB)': 7.67, 'memory/max_allocated (GiB)': 7.67, 'memory/device_reserved (GiB)': 21.19, 'epoch': 1.87}
	100%\|████████████████████████████████████████████████████████████████\| 22/22 [04:28<00:00, 11.26s/it] 100%\|████████████████████████████████████████████████████████████████\| 22/22 [04:28<00:00, 12.21s/it]
	[2025-10-25 17:54:34,884] [INFO] [axolotl.train.save_trained_model:218] [PID:4001] Training completed! Saving trained model to ./model-output.
	[2025-10-25 17:54:44,496] [INFO] [axolotl.train.save_trained_model:336] [PID:4001] Model successfully saved to ./model-output
	[0m