AutomatedScientist's picture
Upload folder using huggingface_hub
dfa42f5 verified
Raw
History Blame Contribute Delete
48.8 kB
[2026-03-21 12:10:47,814] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1904493] baseline 0.000GB ()
[2026-03-21 12:10:47,814] [INFO] [axolotl.cli.config.load_cfg:259] [PID:1904493] config:
{
"activation_offloading": false,
"adapter": "lora",
"axolotl_config_path": "out/qwen3-8b-persistent-20260321_120850/axolotl_config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 64,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": true,
"n_gpu": 4,
"n_node": 1
},
"context_parallel_size": 1,
"dataloader_num_workers": 4,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 288,
"dataset_prepared_path": "out/prepared_dataset_persistent",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "/e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/paired/train/out/paired_data/persistent/traces.jsonl",
"roles_to_train": [
"assistant"
],
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": true,
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.10.0"
},
"eval_batch_size": 1,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_steps": 5,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"flash_attention": true,
"fp16": false,
"gradient_accumulation_steps": 16,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"include_tkps": true,
"is_falcon_derived_model": false,
"is_llama_derived_model": false,
"is_mistral_derived_model": false,
"learning_rate": 0.0001,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_alpha": 128,
"lora_dropout": 0.05,
"lora_r": 64,
"lora_target_linear": false,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"micro_batch_size": 1,
"model_config_type": "qwen3",
"num_epochs": 3.0,
"optimizer": "adamw_torch",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "out/qwen3-8b-persistent-20260321_120850",
"pad_to_sequence_len": true,
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": false,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_strategy": "epoch",
"save_total_limit": 3,
"seed": 3407,
"sequence_len": 16384,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"tokenizer_type": "AutoTokenizer",
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"trust_remote_code": true,
"type_of_model": "AutoModelForCausalLM",
"use_otel_metrics": false,
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.04,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"wandb_project": "pythonformer",
"warmup_ratio": 0.03,
"weight_decay": 0.01,
"world_size": 4
}
[2026-03-21 12:10:47,816] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:1904493] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.
[2026-03-21 12:10:48,089] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:1904493] EOS: 151645 / <|im_end|>
[2026-03-21 12:10:48,089] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:1904493] BOS: None / None
[2026-03-21 12:10:48,089] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:1904493] PAD: 151643 / <|endoftext|>
[2026-03-21 12:10:48,089] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:1904493] UNK: None / None
[2026-03-21 12:10:48,112] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:1904493] Loading prepared dataset from disk at out/prepared_dataset_persistent/ef37a940287be0bb3ec73aa7dbd8c0f8...
[2026-03-21 12:10:48,155] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:1904493] total_num_tokens: 4_453_922
[2026-03-21 12:10:48,201] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:1904493] `total_supervised_tokens: 1_868_710`
[2026-03-21 12:10:48,202] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:1904493] total_num_steps: 45
[2026-03-21 12:10:48,202] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:1904493] Maximum number of steps set at 45
[2026-03-21 12:10:48,236] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:1904493] loading tokenizer... Qwen/Qwen3-8B
[2026-03-21 12:10:48,476] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:1904493] EOS: 151645 / <|im_end|>
[2026-03-21 12:10:48,477] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:1904493] BOS: None / None
[2026-03-21 12:10:48,477] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:1904493] PAD: 151643 / <|endoftext|>
[2026-03-21 12:10:48,477] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:1904493] UNK: None / None
[2026-03-21 12:10:48,477] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:1904493] Loading model
[2026-03-21 12:10:48,491] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1904493] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-21 12:10:48,492] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1904493] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s] Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s] Loading checkpoint shards: 20%|β–ˆβ–ˆ | 1/5 [00:02<00:10, 2.71s/it] Loading checkpoint shards: 20%|β–ˆβ–ˆ | 1/5 [00:02<00:10, 2.71s/it] Loading checkpoint shards: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 2/5 [00:05<00:08, 2.68s/it] Loading checkpoint shards: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 2/5 [00:05<00:08, 2.68s/it] Loading checkpoint shards: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3/5 [00:07<00:04, 2.30s/it] Loading checkpoint shards: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3/5 [00:07<00:04, 2.30s/it] Loading checkpoint shards: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4/5 [00:08<00:01, 1.99s/it] Loading checkpoint shards: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4/5 [00:08<00:01, 1.99s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:09<00:00, 1.49s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:09<00:00, 1.87s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:09<00:00, 1.49s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:09<00:00, 1.87s/it]
[2026-03-21 12:10:58,655] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:853] [PID:1904493] converting PEFT model w/ prepare_model_for_kbit_training
[2026-03-21 12:10:58,768] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:1904493] Converting modules to torch.bfloat16
[2026-03-21 12:10:58,779] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1904493] Memory usage after model load 31.673GB (+31.673GB allocated, +33.244GB reserved)
trainable params: 174,587,904 || all params: 8,365,323,264 || trainable%: 2.0870
[2026-03-21 12:10:59,511] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1904493] after adapters 28.849GB (+28.849GB allocated, +33.578GB reserved)
[2026-03-21 12:11:11,441] [INFO] [axolotl.train.save_initial_configs:413] [PID:1904493] Pre-saving adapter config to out/qwen3-8b-persistent-20260321_120850...
[2026-03-21 12:11:11,442] [INFO] [axolotl.train.save_initial_configs:417] [PID:1904493] Pre-saving tokenizer to out/qwen3-8b-persistent-20260321_120850...
[2026-03-21 12:11:11,589] [INFO] [axolotl.train.save_initial_configs:422] [PID:1904493] Pre-saving model config to out/qwen3-8b-persistent-20260321_120850...
[2026-03-21 12:11:11,592] [INFO] [axolotl.train.execute_training:212] [PID:1904493] Starting trainer...
wandb: Tracking run with wandb version 0.24.2
wandb: W&B syncing is set to `offline` in this directory. Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
wandb: Run data is saved locally in /e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/wandb/offline-run-20260321_121114-xmnrcx1i
wandb: Detected [huggingface_hub.inference] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-21 12:11:16,022] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:1904493] The Axolotl config has been saved to the WandB run under files.
0%| | 0/45 [00:00<?, ?it/s][2026-03-21 12:11:16,024] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:00<00:03, 2.49it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.61it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.40it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:03, 1.29it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.24it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.19it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.18it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.17it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:07<00:00, 1.16it/s]
{'eval_loss': 1.023712396621704, 'eval_runtime': 9.3181, 'eval_samples_per_second': 4.293, 'eval_steps_per_second': 1.073, 'eval_ppl': 2.78351, 'memory/max_active (GiB)': 53.19, 'memory/max_allocated (GiB)': 53.19, 'memory/device_reserved (GiB)': 56.52, 'epoch': 0}
0%| | 0/45 [00:09<?, ?it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:07<00:00, 1.16it/s]
 2%|▏ | 1/45 [00:58<42:40, 58.19s/it] {'loss': 0.9746, 'grad_norm': 2.3994593620300293, 'learning_rate': 0.0, 'ppl': 2.65011, 'memory/max_active (GiB)': 62.81, 'memory/max_allocated (GiB)': 62.81, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 182.9156951904297, 'tokens/total': 1048576, 'tokens/trainable': 133165, 'epoch': 0.07}
2%|▏ | 1/45 [00:58<42:40, 58.19s/it] 4%|▍ | 2/45 [01:46<37:28, 52.29s/it] {'loss': 1.0113, 'grad_norm': 2.5807816982269287, 'learning_rate': 5e-05, 'ppl': 2.74917, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 125.66553497314453, 'tokens/total': 2097152, 'tokens/trainable': 259063, 'epoch': 0.13}
4%|▍ | 2/45 [01:46<37:28, 52.29s/it] 7%|β–‹ | 3/45 [02:34<35:15, 50.37s/it] {'loss': 0.8778, 'grad_norm': 1.2180739641189575, 'learning_rate': 0.0001, 'ppl': 2.4056, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 132.6904296875, 'tokens/total': 3145728, 'tokens/trainable': 380399, 'epoch': 0.2}
7%|β–‹ | 3/45 [02:34<35:15, 50.37s/it] 9%|β–‰ | 4/45 [03:22<33:50, 49.52s/it] {'loss': 0.7532, 'grad_norm': 0.3355015218257904, 'learning_rate': 9.986661418317759e-05, 'ppl': 2.12379, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 123.76305389404297, 'tokens/total': 4194304, 'tokens/trainable': 505772, 'epoch': 0.27}
9%|β–‰ | 4/45 [03:22<33:50, 49.52s/it] 11%|β–ˆ | 5/45 [04:10<32:41, 49.05s/it] {'loss': 0.7196, 'grad_norm': 0.272516667842865, 'learning_rate': 9.946716840375551e-05, 'ppl': 2.05361, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 178.20230102539062, 'tokens/total': 5242880, 'tokens/trainable': 631551, 'epoch': 0.33}
11%|β–ˆ | 5/45 [04:10<32:41, 49.05s/it][2026-03-21 12:15:26,893] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.69it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:02<00:05, 1.35it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:03<00:04, 1.26it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.20it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.17it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.15it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.15it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.14it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.7008563876152039, 'eval_runtime': 9.3746, 'eval_samples_per_second': 4.267, 'eval_steps_per_second': 1.067, 'eval_ppl': 2.01548, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 0.33, 'tokens/train_per_sec_per_gpu': 0.0}
11%|β–ˆ | 5/45 [04:20<32:41, 49.05s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 13%|β–ˆβ–Ž | 6/45 [05:08<33:50, 52.07s/it] {'loss': 0.6826, 'grad_norm': 0.24676384031772614, 'learning_rate': 9.880379387779637e-05, 'ppl': 1.97902, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 170.88426208496094, 'tokens/total': 6291456, 'tokens/trainable': 751279, 'epoch': 0.4}
13%|β–ˆβ–Ž | 6/45 [05:08<33:50, 52.07s/it] 16%|β–ˆβ–Œ | 7/45 [05:57<32:12, 50.85s/it] {'loss': 0.6878, 'grad_norm': 0.2530628740787506, 'learning_rate': 9.78800299954203e-05, 'ppl': 1.98933, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 136.3604278564453, 'tokens/total': 7340032, 'tokens/trainable': 871188, 'epoch': 0.47}
16%|β–ˆβ–Œ | 7/45 [05:57<32:12, 50.85s/it] 18%|β–ˆβ–Š | 8/45 [06:45<30:49, 50.00s/it] {'loss': 0.6489, 'grad_norm': 0.23260381817817688, 'learning_rate': 9.67008054366274e-05, 'ppl': 1.91343, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 137.96417236328125, 'tokens/total': 8388608, 'tokens/trainable': 1005318, 'epoch': 0.53}
18%|β–ˆβ–Š | 8/45 [06:45<30:49, 50.00s/it] 20%|β–ˆβ–ˆ | 9/45 [07:33<29:36, 49.35s/it] {'loss': 0.6325, 'grad_norm': 0.2275058925151825, 'learning_rate': 9.527241187465734e-05, 'ppl': 1.88231, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 176.22164916992188, 'tokens/total': 9437184, 'tokens/trainable': 1115945, 'epoch': 0.6}
20%|β–ˆβ–ˆ | 9/45 [07:33<29:36, 49.35s/it] 22%|β–ˆβ–ˆβ– | 10/45 [08:21<28:38, 49.09s/it] {'loss': 0.5896, 'grad_norm': 0.16890013217926025, 'learning_rate': 9.360247040719039e-05, 'ppl': 1.80327, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 237.16334533691406, 'tokens/total': 10485760, 'tokens/trainable': 1244402, 'epoch': 0.67}
22%|β–ˆβ–ˆβ– | 10/45 [08:21<28:38, 49.09s/it][2026-03-21 12:19:37,764] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.84it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:02<00:04, 1.41it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.29it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.22it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.19it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.15it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.15it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.5861338376998901, 'eval_runtime': 9.2873, 'eval_samples_per_second': 4.307, 'eval_steps_per_second': 1.077, 'eval_ppl': 1.79703, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 0.67, 'tokens/train_per_sec_per_gpu': 0.0}
22%|β–ˆβ–ˆβ– | 10/45 [08:31<28:38, 49.09s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 24%|β–ˆβ–ˆβ– | 11/45 [09:20<29:29, 52.03s/it] {'loss': 0.5798, 'grad_norm': 0.1342519223690033, 'learning_rate': 9.16998908944939e-05, 'ppl': 1.78568, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 166.65391540527344, 'tokens/total': 11534336, 'tokens/trainable': 1379886, 'epoch': 0.73}
24%|β–ˆβ–ˆβ– | 11/45 [09:20<29:29, 52.03s/it] 27%|β–ˆβ–ˆβ–‹ | 12/45 [10:08<27:57, 50.83s/it] {'loss': 0.5796, 'grad_norm': 0.13311029970645905, 'learning_rate': 8.957482442146272e-05, 'ppl': 1.78532, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 134.82778930664062, 'tokens/total': 12582912, 'tokens/trainable': 1498175, 'epoch': 0.8}
27%|β–ˆβ–ˆβ–‹ | 12/45 [10:08<27:57, 50.83s/it] 29%|β–ˆβ–ˆβ–‰ | 13/45 [10:56<26:39, 49.97s/it] {'loss': 0.5531, 'grad_norm': 0.13393263518810272, 'learning_rate': 8.72386091371891e-05, 'ppl': 1.73863, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 173.0598907470703, 'tokens/total': 13631488, 'tokens/trainable': 1620026, 'epoch': 0.87}
29%|β–ˆβ–ˆβ–‰ | 13/45 [10:56<26:39, 49.97s/it] 31%|β–ˆβ–ˆβ–ˆ | 14/45 [11:44<25:35, 49.52s/it] {'loss': 0.5481, 'grad_norm': 0.18279989063739777, 'learning_rate': 8.47037097610317e-05, 'ppl': 1.72996, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 108.0434341430664, 'tokens/total': 14680064, 'tokens/trainable': 1740387, 'epoch': 0.93}
31%|β–ˆβ–ˆβ–ˆ | 14/45 [11:44<25:35, 49.52s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 15/45 [12:33<24:35, 49.19s/it] {'loss': 0.5379, 'grad_norm': 0.13692247867584229, 'learning_rate': 8.198365107794457e-05, 'ppl': 1.71241, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 159.22552490234375, 'tokens/total': 15728640, 'tokens/trainable': 1868710, 'epoch': 1.0}
33%|β–ˆβ–ˆβ–ˆβ–Ž | 15/45 [12:33<24:35, 49.19s/it][2026-03-21 12:23:49,441] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:00<00:03, 2.19it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.51it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.34it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.25it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.21it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.17it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.16it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.527126133441925, 'eval_runtime': 9.2917, 'eval_samples_per_second': 4.305, 'eval_steps_per_second': 1.076, 'eval_ppl': 1.69406, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 1.0, 'tokens/train_per_sec_per_gpu': 0.0}
33%|β–ˆβ–ˆβ–ˆβ–Ž | 15/45 [12:42<24:35, 49.19s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
[2026-03-21 12:23:58,740] [INFO] [axolotl.core.trainers.base._save:721] [PID:1904493] Saving model checkpoint to out/qwen3-8b-persistent-20260321_120850/checkpoint-15
[2026-03-21 12:23:58,770] [WARNING] [py.warnings._showwarnmsg:112] [PID:1904493] /e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/.venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:295: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
warnings.warn(
36%|β–ˆβ–ˆβ–ˆβ–Œ | 16/45 [13:33<25:23, 52.55s/it] {'loss': 0.498, 'grad_norm': 0.12188886106014252, 'learning_rate': 7.909294577789766e-05, 'ppl': 1.64543, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 180.2497100830078, 'tokens/total': 16777216, 'tokens/trainable': 1994404, 'epoch': 1.07}
36%|β–ˆβ–ˆβ–ˆβ–Œ | 16/45 [13:33<25:23, 52.55s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 17/45 [14:21<23:55, 51.25s/it] {'loss': 0.5066, 'grad_norm': 0.12189345806837082, 'learning_rate': 7.604701702439651e-05, 'ppl': 1.65964, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 171.74996948242188, 'tokens/total': 17825792, 'tokens/trainable': 2121130, 'epoch': 1.13}
38%|β–ˆβ–ˆβ–ˆβ–Š | 17/45 [14:22<23:55, 51.25s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 18/45 [15:11<22:48, 50.67s/it] {'loss': 0.5064, 'grad_norm': 0.10782640427350998, 'learning_rate': 7.286211616523193e-05, 'ppl': 1.65931, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 166.5288543701172, 'tokens/total': 18874368, 'tokens/trainable': 2250078, 'epoch': 1.2}
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 18/45 [15:11<22:48, 50.67s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 19/45 [15:59<21:36, 49.88s/it] {'loss': 0.5113, 'grad_norm': 0.11023414880037308, 'learning_rate': 6.95552360245078e-05, 'ppl': 1.66746, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 144.7275848388672, 'tokens/total': 19922944, 'tokens/trainable': 2365801, 'epoch': 1.27}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 19/45 [15:59<21:36, 49.88s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20/45 [16:48<20:38, 49.54s/it] {'loss': 0.4998, 'grad_norm': 0.10430668294429779, 'learning_rate': 6.614402023857232e-05, 'ppl': 1.64839, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 167.75204467773438, 'tokens/total': 20971520, 'tokens/trainable': 2492718, 'epoch': 1.33}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20/45 [16:48<20:38, 49.54s/it][2026-03-21 12:28:04,133] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.91it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.43it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.30it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.23it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.19it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.15it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.4944220185279846, 'eval_runtime': 9.2506, 'eval_samples_per_second': 4.324, 'eval_steps_per_second': 1.081, 'eval_ppl': 1.63955, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 1.33, 'tokens/train_per_sec_per_gpu': 0.0}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 20/45 [16:57<20:38, 49.54s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/45 [17:45<20:46, 51.94s/it] {'loss': 0.5083, 'grad_norm': 0.10462717711925507, 'learning_rate': 6.264666911958404e-05, 'ppl': 1.66246, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 179.36976623535156, 'tokens/total': 22020096, 'tokens/trainable': 2618366, 'epoch': 1.4}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/45 [17:45<20:46, 51.94s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 22/45 [18:34<19:30, 50.87s/it] {'loss': 0.5062, 'grad_norm': 0.10122773051261902, 'learning_rate': 5.908184254897182e-05, 'ppl': 1.65898, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 145.87869262695312, 'tokens/total': 23068672, 'tokens/trainable': 2733139, 'epoch': 1.47}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 22/45 [18:34<19:30, 50.87s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/45 [19:22<18:22, 50.12s/it] {'loss': 0.4791, 'grad_norm': 0.1740027815103531, 'learning_rate': 5.546856041889373e-05, 'ppl': 1.61462, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 151.44520568847656, 'tokens/total': 24117248, 'tokens/trainable': 2855825, 'epoch': 1.53}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/45 [19:22<18:22, 50.12s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/45 [20:11<17:26, 49.84s/it] {'loss': 0.4809, 'grad_norm': 0.08534331619739532, 'learning_rate': 5.182610115288295e-05, 'ppl': 1.61753, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 126.7411880493164, 'tokens/total': 25165824, 'tokens/trainable': 2989569, 'epoch': 1.6}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/45 [20:11<17:26, 49.84s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/45 [20:59<16:27, 49.37s/it] {'loss': 0.4986, 'grad_norm': 0.08693187683820724, 'learning_rate': 4.817389884711705e-05, 'ppl': 1.64641, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 131.96170043945312, 'tokens/total': 26214400, 'tokens/trainable': 3108746, 'epoch': 1.67}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/45 [20:59<16:27, 49.37s/it][2026-03-21 12:32:15,866] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.96it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.44it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.31it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.23it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.19it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.16it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.4709378182888031, 'eval_runtime': 9.2231, 'eval_samples_per_second': 4.337, 'eval_steps_per_second': 1.084, 'eval_ppl': 1.6015, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 1.67, 'tokens/train_per_sec_per_gpu': 0.0}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/45 [21:09<16:27, 49.37s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/45 [21:57<16:27, 51.95s/it] {'loss': 0.4754, 'grad_norm': 0.07740820199251175, 'learning_rate': 4.4531439581106295e-05, 'ppl': 1.60866, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 137.57606506347656, 'tokens/total': 27262976, 'tokens/trainable': 3240635, 'epoch': 1.73}
58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/45 [21:57<16:27, 51.95s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/45 [22:46<15:15, 50.84s/it] {'loss': 0.4503, 'grad_norm': 0.08195707201957703, 'learning_rate': 4.0918157451028185e-05, 'ppl': 1.56878, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 183.06039428710938, 'tokens/total': 28311552, 'tokens/trainable': 3359929, 'epoch': 1.8}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/45 [22:46<15:15, 50.84s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 28/45 [23:34<14:12, 50.17s/it] {'loss': 0.4803, 'grad_norm': 0.08149150758981705, 'learning_rate': 3.735333088041596e-05, 'ppl': 1.61656, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 169.7770538330078, 'tokens/total': 29360128, 'tokens/trainable': 3490302, 'epoch': 1.87}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 28/45 [23:34<14:12, 50.17s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 29/45 [24:22<13:12, 49.56s/it] {'loss': 0.4571, 'grad_norm': 0.08145039528608322, 'learning_rate': 3.38559797614277e-05, 'ppl': 1.57949, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 125.50889587402344, 'tokens/total': 30408704, 'tokens/trainable': 3610685, 'epoch': 1.93}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 29/45 [24:22<13:12, 49.56s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 30/45 [25:11<12:21, 49.43s/it] {'loss': 0.4447, 'grad_norm': 0.07428699731826782, 'learning_rate': 3.0444763975492208e-05, 'ppl': 1.56002, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 191.67347717285156, 'tokens/total': 31457280, 'tokens/trainable': 3737420, 'epoch': 2.0}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 30/45 [25:11<12:21, 49.43s/it][2026-03-21 12:36:27,947] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:00<00:03, 2.10it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.49it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.33it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.24it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.20it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.16it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.4570392966270447, 'eval_runtime': 9.2728, 'eval_samples_per_second': 4.314, 'eval_steps_per_second': 1.078, 'eval_ppl': 1.57939, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 2.0, 'tokens/train_per_sec_per_gpu': 0.0}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 30/45 [25:21<12:21, 49.43s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
[2026-03-21 12:36:37,228] [INFO] [axolotl.core.trainers.base._save:721] [PID:1904493] Saving model checkpoint to out/qwen3-8b-persistent-20260321_120850/checkpoint-30
[2026-03-21 12:36:37,246] [WARNING] [py.warnings._showwarnmsg:112] [PID:1904493] /e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/.venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:295: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
warnings.warn(
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 31/45 [26:12<12:19, 52.85s/it] {'loss': 0.4605, 'grad_norm': 0.07451663911342621, 'learning_rate': 2.7137883834768073e-05, 'ppl': 1.58487, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 143.8233642578125, 'tokens/total': 32505856, 'tokens/trainable': 3879393, 'epoch': 2.07}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 31/45 [26:12<12:19, 52.85s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 32/45 [27:00<11:08, 51.40s/it] {'loss': 0.4552, 'grad_norm': 0.08845459669828415, 'learning_rate': 2.3952982975603496e-05, 'ppl': 1.57649, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 141.9944305419922, 'tokens/total': 33554432, 'tokens/trainable': 3999230, 'epoch': 2.13}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 32/45 [27:00<11:08, 51.40s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 33/45 [27:48<10:04, 50.35s/it] {'loss': 0.4625, 'grad_norm': 0.07337811589241028, 'learning_rate': 2.090705422210237e-05, 'ppl': 1.58804, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 125.76362609863281, 'tokens/total': 34603008, 'tokens/trainable': 4123262, 'epoch': 2.2}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 33/45 [27:48<10:04, 50.35s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 34/45 [28:36<09:06, 49.65s/it] {'loss': 0.4357, 'grad_norm': 0.07691735774278641, 'learning_rate': 1.801634892205545e-05, 'ppl': 1.54604, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 140.31842041015625, 'tokens/total': 35651584, 'tokens/trainable': 4232270, 'epoch': 2.27}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 34/45 [28:36<09:06, 49.65s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 35/45 [29:25<08:13, 49.39s/it] {'loss': 0.444, 'grad_norm': 0.06961730122566223, 'learning_rate': 1.5296290238968303e-05, 'ppl': 1.55893, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 142.96621704101562, 'tokens/total': 36700160, 'tokens/trainable': 4358382, 'epoch': 2.33}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 35/45 [29:25<08:13, 49.39s/it][2026-03-21 12:40:41,769] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.95it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.44it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.30it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.23it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.19it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.15it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.4494328498840332, 'eval_runtime': 9.2342, 'eval_samples_per_second': 4.332, 'eval_steps_per_second': 1.083, 'eval_ppl': 1.56742, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 2.33, 'tokens/train_per_sec_per_gpu': 0.0}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 35/45 [29:34<08:13, 49.39s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 36/45 [30:22<07:46, 51.83s/it] {'loss': 0.4496, 'grad_norm': 0.0727730318903923, 'learning_rate': 1.2761390862810907e-05, 'ppl': 1.56768, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 171.24005126953125, 'tokens/total': 37748736, 'tokens/trainable': 4475221, 'epoch': 2.4}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 36/45 [30:22<07:46, 51.83s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 37/45 [31:11<06:46, 50.87s/it] {'loss': 0.4438, 'grad_norm': 0.0709051787853241, 'learning_rate': 1.0425175578537299e-05, 'ppl': 1.55862, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 213.58218383789062, 'tokens/total': 38797312, 'tokens/trainable': 4605991, 'epoch': 2.47}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 37/45 [31:11<06:46, 50.87s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 38/45 [32:00<05:51, 50.17s/it] {'loss': 0.4512, 'grad_norm': 0.0714455395936966, 'learning_rate': 8.30010910550611e-06, 'ppl': 1.5702, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 151.91529846191406, 'tokens/total': 39845888, 'tokens/trainable': 4721248, 'epoch': 2.53}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 38/45 [32:00<05:51, 50.17s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 39/45 [32:49<04:58, 49.79s/it] {'loss': 0.4353, 'grad_norm': 0.07068906724452972, 'learning_rate': 6.397529592809614e-06, 'ppl': 1.54543, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 149.29034423828125, 'tokens/total': 40894464, 'tokens/trainable': 4849107, 'epoch': 2.6}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 39/45 [32:49<04:58, 49.79s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 40/45 [33:37<04:07, 49.44s/it] {'loss': 0.4234, 'grad_norm': 0.06998934596776962, 'learning_rate': 4.727588125342669e-06, 'ppl': 1.52715, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 153.89173889160156, 'tokens/total': 41943040, 'tokens/trainable': 4966302, 'epoch': 2.67}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 40/45 [33:37<04:07, 49.44s/it][2026-03-21 12:44:53,723] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:01<00:04, 1.89it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:02<00:04, 1.42it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.29it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.22it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.19it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.15it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.44656768441200256, 'eval_runtime': 9.263, 'eval_samples_per_second': 4.318, 'eval_steps_per_second': 1.08, 'eval_ppl': 1.56294, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 2.67, 'tokens/train_per_sec_per_gpu': 0.0}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 40/45 [33:46<04:07, 49.44s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 41/45 [34:35<03:27, 51.83s/it] {'loss': 0.424, 'grad_norm': 0.07179196178913116, 'learning_rate': 3.299194563372604e-06, 'ppl': 1.52806, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 146.40927124023438, 'tokens/total': 42991616, 'tokens/trainable': 5079587, 'epoch': 2.73}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 41/45 [34:35<03:27, 51.83s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 42/45 [35:24<02:32, 50.97s/it] {'loss': 0.4713, 'grad_norm': 0.06713048368692398, 'learning_rate': 2.1199700045797077e-06, 'ppl': 1.60208, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 185.44821166992188, 'tokens/total': 44040192, 'tokens/trainable': 5213935, 'epoch': 2.8}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 42/45 [35:24<02:32, 50.97s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 43/45 [36:12<01:40, 50.19s/it] {'loss': 0.4439, 'grad_norm': 0.07158804684877396, 'learning_rate': 1.196206122203647e-06, 'ppl': 1.55877, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 152.13075256347656, 'tokens/total': 45088768, 'tokens/trainable': 5343493, 'epoch': 2.87}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 43/45 [36:12<01:40, 50.19s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 44/45 [37:01<00:49, 49.98s/it] {'loss': 0.4697, 'grad_norm': 0.07157603651285172, 'learning_rate': 5.328315962444874e-07, 'ppl': 1.59951, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 166.90415954589844, 'tokens/total': 46137344, 'tokens/trainable': 5482217, 'epoch': 2.93}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 44/45 [37:01<00:49, 49.98s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 45/45 [37:50<00:00, 49.49s/it] {'loss': 0.4383, 'grad_norm': 0.06905380636453629, 'learning_rate': 1.333858168224178e-07, 'ppl': 1.55007, 'memory/max_active (GiB)': 64.12, 'memory/max_allocated (GiB)': 64.12, 'memory/device_reserved (GiB)': 66.97, 'tokens/train_per_sec_per_gpu': 164.1426544189453, 'tokens/total': 47185920, 'tokens/trainable': 5606130, 'epoch': 3.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 45/45 [37:50<00:00, 49.49s/it][2026-03-21 12:49:06,280] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:1904493] Running evaluation step...
0%| | 0/10 [00:00<?, ?it/s]
20%|β–ˆβ–ˆ | 2/10 [00:00<00:03, 2.10it/s]
30%|β–ˆβ–ˆβ–ˆ | 3/10 [00:01<00:04, 1.48it/s]
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 4/10 [00:02<00:04, 1.32it/s]
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5/10 [00:03<00:04, 1.24it/s]
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/10 [00:04<00:03, 1.20it/s]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 7/10 [00:05<00:02, 1.16it/s]
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 8/10 [00:06<00:01, 1.16it/s]
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 9/10 [00:07<00:00, 1.15it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
{'eval_loss': 0.4460541605949402, 'eval_runtime': 9.2732, 'eval_samples_per_second': 4.314, 'eval_steps_per_second': 1.078, 'eval_ppl': 1.56214, 'memory/max_active (GiB)': 54.54, 'memory/max_allocated (GiB)': 54.54, 'memory/device_reserved (GiB)': 66.97, 'epoch': 3.0, 'tokens/train_per_sec_per_gpu': 0.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 45/45 [37:59<00:00, 49.49s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 10/10 [00:08<00:00, 1.14it/s]
[2026-03-21 12:49:15,561] [INFO] [axolotl.core.trainers.base._save:721] [PID:1904493] Saving model checkpoint to out/qwen3-8b-persistent-20260321_120850/checkpoint-45
[2026-03-21 12:49:15,579] [WARNING] [py.warnings._showwarnmsg:112] [PID:1904493] /e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/.venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:295: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
warnings.warn(
{'train_runtime': 2282.7571, 'train_samples_per_second': 1.262, 'train_steps_per_second': 0.02, 'train_loss': 0.541955092880461, 'memory/max_active (GiB)': 30.88, 'memory/max_allocated (GiB)': 30.88, 'memory/device_reserved (GiB)': 66.97, 'epoch': 3.0, 'tokens/train_per_sec_per_gpu': 0.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 45/45 [38:01<00:00, 49.49s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 45/45 [38:01<00:00, 50.69s/it]
[2026-03-21 12:49:17,350] [INFO] [axolotl.train.save_trained_model:233] [PID:1904493] Training completed! Saving trained model to out/qwen3-8b-persistent-20260321_120850.
[2026-03-21 12:49:17,369] [WARNING] [py.warnings._showwarnmsg:112] [PID:1904493] /e/project1/reformo/salgarkar1/agents_learn/pythonformer-workshop/.venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:295: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
warnings.warn(
[2026-03-21 12:49:17,816] [INFO] [axolotl.train.save_trained_model:351] [PID:1904493] Model successfully saved to out/qwen3-8b-persistent-20260321_120850