muse-marvin-lora-2 / debug.log
ToastyPigeon's picture
End of training
36285fc verified
[2025-10-07 11:50:13,057] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:8314] bf16 support detected, enabling for this configuration.
[2025-10-07 11:50:13,281] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:8314] baseline 0.000GB ()
[2025-10-07 11:50:13,282] [INFO] [axolotl.cli.config.load_cfg:248] [PID:8314] config:
{
"activation_offloading": false,
"adapter": "qlora",
"axolotl_config_path": "muse-marvin-attn.yaml",
"base_model": "LatitudeGames/Muse-12B",
"base_model_config": "LatitudeGames/Muse-12B",
"batch_size": 8,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_86",
"fp8": false,
"n_gpu": 2,
"n_node": 1
},
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 2,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_prepared_path": "last_run_prepared",
"dataset_processes": 24,
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "conversations",
"message_property_mappings": {
"content": "value",
"role": "from"
},
"path": "grimulkan/LimaRP-augmented",
"trust_remote_code": false,
"type": "chat_template"
},
{
"data_files": "marvin.json",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "ToastyPigeon/steve-and-marvin",
"trust_remote_code": false,
"type": "completion"
},
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "ToastyPigeon/kimi-stories-completion",
"trust_remote_code": false,
"type": "completion"
}
],
"ddp": true,
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.7.1"
},
"eval_batch_size": 1,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_steps": 0.1,
"eval_table_size": 0,
"evals_per_epoch": 10,
"experimental_skip_move_to_device": true,
"flash_attention": true,
"fp16": false,
"fsdp": [
"full_shard",
"auto_wrap"
],
"fsdp_config": {
"activation_checkpointing": true,
"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"cpu_ram_efficient_loading": true,
"limit_all_gathers": true,
"offload_params": true,
"sharding_strategy": "FULL_SHARD",
"state_dict_type": "FULL_STATE_DICT",
"sync_module_states": true,
"transformer_layer_cls_to_wrap": "MistralDecoderLayer",
"use_orig_params": false
},
"gc_steps": 10,
"gradient_accumulation_steps": 4,
"gradient_checkpointing": false,
"group_by_length": false,
"hub_model_id": "ToastyPigeon/muse-marvin-lora-2",
"hub_strategy": "every_save",
"include_tkps": true,
"is_mistral_derived_model": true,
"learning_rate": 1e-05,
"liger_glu_activation": true,
"liger_layer_norm": true,
"liger_rms_norm": true,
"liger_rope": true,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_alpha": 32,
"lora_dropout": 0.1,
"lora_r": 32,
"lora_target_linear": true,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"micro_batch_size": 1,
"model_config_type": "mistral",
"num_epochs": 1.0,
"optimizer": "adamw_torch_fused",
"output_dir": "ckpts-mmarv",
"pad_to_sequence_len": true,
"peft_use_rslora": false,
"plugins": [
"axolotl.integrations.liger.LigerPlugin",
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.1,
"save_total_limit": 1,
"saves_per_epoch": 10,
"seed": 69,
"sequence_len": 16384,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "LatitudeGames/Muse-12B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.025,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"wandb_name": "r32-qlora-all-linear",
"wandb_project": "MuseMarvin",
"warmup_ratio": 0.025,
"weight_decay": 0.01,
"world_size": 2
}
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8314] EOS: 131072 / <|im_end|>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8314] BOS: 1 / <s>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8314] PAD: 10 / <pad>
[2025-10-07 11:50:14,275] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8314] UNK: 0 / <unk>
[2025-10-07 11:50:14,275] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:470] [PID:8314] Loading prepared dataset from disk at last_run_prepared/31b44b9f810943b30f3af91fc7580ba1...
[2025-10-07 11:50:14,288] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:8314] total_num_tokens: 758_181
[2025-10-07 11:50:14,290] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:8314] `total_supervised_tokens: 716_387`
[2025-10-07 11:50:15,905] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6418807506561279
[2025-10-07 11:50:16,550] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6455550193786621
[2025-10-07 11:50:17,196] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6447782516479492
[2025-10-07 11:50:17,842] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.6462986469268799
[2025-10-07 11:50:18,408] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
[2025-10-07 11:50:18,460] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:8314] data_loader_len: 5
[2025-10-07 11:50:18,469] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:8314] sample_packing_eff_est across ranks: [0.9845892786979675, 0.9845892786979675]
[2025-10-07 11:50:18,469] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:8314] sample_packing_eff_est: None
[2025-10-07 11:50:18,469] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:8314] total_num_steps: 5
[2025-10-07 11:50:18,562] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:8314] total_num_tokens: 30_240_821
[2025-10-07 11:50:18,664] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:8314] `total_supervised_tokens: 28_482_459`
[2025-10-07 11:50:20,128] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7220749855041504
[2025-10-07 11:50:20,839] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7103662490844727
[2025-10-07 11:50:21,559] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.719578742980957
[2025-10-07 11:50:22,294] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 0.7134625911712646
[2025-10-07 11:50:22,295] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [1861, 1860]
[2025-10-07 11:50:22,296] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:8314] data_loader_len: 232
[2025-10-07 11:50:22,296] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:8314] sample_packing_eff_est across ranks: [0.991807222366333, 0.991807222366333]
[2025-10-07 11:50:22,297] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:8314] sample_packing_eff_est: 1.0
[2025-10-07 11:50:22,297] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:8314] total_num_steps: 232
[2025-10-07 11:50:22,297] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:8314] Maximum number of steps set at 232
[2025-10-07 11:50:22,323] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:8314] Loading tokenizer... LatitudeGames/Muse-12B
[2025-10-07 11:50:23,211] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8314] EOS: 131072 / <|im_end|>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8314] BOS: 1 / <s>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8314] PAD: 10 / <pad>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8314] UNK: 0 / <unk>
[2025-10-07 11:50:23,212] [DEBUG] [axolotl.train.setup_model_and_tokenizer:79] [PID:8314] Loading model
[2025-10-07 11:50:23,364] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:8314] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-10-07 11:50:23,365] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:8314] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2025-10-07 11:50:23,365] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:8314] Applying multipack dataloader patch for sample packing...
[2025-10-07 11:50:23,385] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:8314] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True}
[2025-10-07 11:50:23,540] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:8314] Applying Cut Cross Entropy to model type: mistral
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s] Loading checkpoint shards: 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1/5 [00:04<00:18, 4.63s/it] Loading checkpoint shards: 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2/5 [00:09<00:14, 4.80s/it] Loading checkpoint shards: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 3/5 [00:14<00:09, 4.87s/it] Loading checkpoint shards: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4/5 [00:19<00:04, 4.89s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:23<00:00, 4.73s/it] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:23<00:00, 4.77s/it]
[2025-10-07 11:50:47,921] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:8314] Converting modules to torch.bfloat16
[2025-10-07 11:50:47,923] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8314] Memory usage after model load 5.750GB (+5.750GB allocated, +5.797GB reserved)
[2025-10-07 11:50:47,924] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:8314] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 114,032,640 || all params: 12,361,835,520 || trainable%: 0.9225
[2025-10-07 11:50:48,830] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8314] after adapters 0.000GB ()
[2025-10-07 11:50:52,955] [INFO] [axolotl.train.save_initial_configs:408] [PID:8314] Pre-saving adapter config to ckpts-mmarv...
[2025-10-07 11:50:52,977] [INFO] [axolotl.train.save_initial_configs:412] [PID:8314] Pre-saving tokenizer to ckpts-mmarv...
[2025-10-07 11:50:53,168] [INFO] [axolotl.train.save_initial_configs:417] [PID:8314] Pre-saving model config to ckpts-mmarv...
[2025-10-07 11:50:53,171] [INFO] [axolotl.train.execute_training:203] [PID:8314] Starting trainer...
[2025-10-07 11:51:05,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.541344404220581
[2025-10-07 11:51:06,738] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.559652328491211
[2025-10-07 11:51:08,262] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.5244674682617188
[2025-10-07 11:51:09,808] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.54478120803833
[2025-10-07 11:51:09,828] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [1860, 1860]
wandb: Currently logged in as: cooawoo (cooawoo-personal) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: β’Ώ Waiting for wandb.init()...
m wandb: β£» setting up run 0vwfkgsm (0.2s)
m wandb: Tracking run with wandb version 0.22.1
wandb: Run data is saved locally in /workspace/training/wandb/run-20251007_115115-0vwfkgsm
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run r32-qlora-all-linear
wandb: ⭐️ View project at https://wandb.ai/cooawoo-personal/MuseMarvin
wandb: πŸš€ View run at https://wandb.ai/cooawoo-personal/MuseMarvin/runs/0vwfkgsm
wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[2025-10-07 11:51:18,257] [INFO] [axolotl.utils.callbacks.on_train_begin:793] [PID:8314] The Axolotl config has been saved to the WandB run under files.
0%| | 0/232 [00:00<?, ?it/s][2025-10-07 11:51:18,258] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 11:51:21,179] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.3499112129211426
[2025-10-07 11:51:22,541] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.361877202987671
[2025-10-07 11:51:23,871] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.3298544883728027
[2025-10-07 11:51:25,162] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.290006399154663
[2025-10-07 11:51:25,163] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:07<01:21, 3.88s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:15<01:52, 5.65s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:05, 6.60s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:09, 7.19s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:07, 7.53s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:48<02:03, 7.72s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:56<01:57, 7.85s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.01s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.04s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.07s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:29<01:28, 8.08s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:37<01:21, 8.16s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.15s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.14s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.88s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:09<00:47, 7.98s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:17<00:40, 8.04s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:25<00:32, 8.06s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.08s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:42<00:16, 8.14s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:50<00:08, 8.15s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.28s/it]
{'eval_loss': 2.5322844982147217, 'eval_runtime': 203.0647, 'eval_samples_per_second': 0.359, 'eval_steps_per_second': 0.182, 'memory/max_active (GiB)': 8.04, 'memory/max_allocated (GiB)': 6.73, 'memory/device_reserved (GiB)': 8.36, 'epoch': 0}
0%| | 0/232 [03:29<?, ?it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.28s/it]
 0%|▏ | 1/232 [05:12<20:03:02, 312.48s/it] {'loss': 2.5109, 'grad_norm': 0.33189067244529724, 'learning_rate': 0.0, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 4214.45, 'epoch': 0.0}
0%|▏ | 1/232 [05:12<20:03:02, 312.48s/it] 1%|▍ | 2/232 [06:56<12:08:54, 190.15s/it] {'loss': 2.6357, 'grad_norm': 0.35216283798217773, 'learning_rate': 2.0000000000000003e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.16, 'epoch': 0.01}
1%|▍ | 2/232 [06:57<12:08:54, 190.15s/it] 1%|β–Œ | 3/232 [08:41<9:36:52, 151.15s/it] {'loss': 2.5537, 'grad_norm': 0.35557371377944946, 'learning_rate': 4.000000000000001e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.76, 'epoch': 0.01}
1%|β–Œ | 3/232 [08:41<9:36:52, 151.15s/it] 2%|β–Š | 4/232 [10:25<8:23:35, 132.52s/it] {'loss': 2.5444, 'grad_norm': 0.3299483060836792, 'learning_rate': 6e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 508.06, 'epoch': 0.02}
2%|β–Š | 4/232 [10:25<8:23:35, 132.52s/it] 2%|β–‰ | 5/232 [12:06<7:37:27, 120.91s/it] {'loss': 2.5188, 'grad_norm': 0.30499881505966187, 'learning_rate': 8.000000000000001e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.1, 'epoch': 0.02}
2%|β–‰ | 5/232 [12:06<7:37:27, 120.91s/it] 3%|β–ˆβ– | 6/232 [13:49<7:12:55, 114.94s/it] {'loss': 2.5716, 'grad_norm': 0.3081396818161011, 'learning_rate': 1e-05, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 615.36, 'epoch': 0.03}
3%|β–ˆβ– | 6/232 [13:49<7:12:55, 114.94s/it] 3%|β–ˆβ– | 7/232 [15:34<6:58:49, 111.69s/it] {'loss': 2.5446, 'grad_norm': 0.3237297236919403, 'learning_rate': 9.99952117026961e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.2, 'epoch': 0.03}
3%|β–ˆβ– | 7/232 [15:34<6:58:49, 111.69s/it] 3%|β–ˆβ–Œ | 8/232 [17:19<6:49:00, 109.56s/it] {'loss': 2.5747, 'grad_norm': 0.43742334842681885, 'learning_rate': 9.998084772789603e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.04, 'epoch': 0.03}
3%|β–ˆβ–Œ | 8/232 [17:19<6:49:00, 109.56s/it] 4%|β–ˆβ–Š | 9/232 [19:03<6:40:56, 107.88s/it] {'loss': 2.6133, 'grad_norm': 0.3281920254230499, 'learning_rate': 9.995691082675908e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 502.32, 'epoch': 0.04}
4%|β–ˆβ–Š | 9/232 [19:03<6:40:56, 107.88s/it] 4%|β–ˆβ–‰ | 10/232 [20:49<6:36:36, 107.19s/it] {'loss': 2.4171, 'grad_norm': 0.3485400676727295, 'learning_rate': 9.99234055839652e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 557.48, 'epoch': 0.04}
4%|β–ˆβ–‰ | 10/232 [20:49<6:36:36, 107.19s/it] 5%|β–ˆβ–ˆβ– | 11/232 [22:33<6:31:21, 106.25s/it] {'loss': 2.5725, 'grad_norm': 0.337157666683197, 'learning_rate': 9.988033841683694e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 588.5, 'epoch': 0.05}
5%|β–ˆβ–ˆβ– | 11/232 [22:33<6:31:21, 106.25s/it] 5%|β–ˆβ–ˆβ–Ž | 12/232 [24:18<6:27:53, 105.79s/it] {'loss': 2.563, 'grad_norm': 0.3561720848083496, 'learning_rate': 9.982771757411032e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.06, 'epoch': 0.05}
5%|β–ˆβ–ˆβ–Ž | 12/232 [24:18<6:27:53, 105.79s/it] 6%|β–ˆβ–ˆβ–Œ | 13/232 [26:02<6:24:17, 105.28s/it] {'loss': 2.4225, 'grad_norm': 0.3262440264225006, 'learning_rate': 9.97655531343549e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 591.93, 'epoch': 0.06}
6%|β–ˆβ–ˆβ–Œ | 13/232 [26:02<6:24:17, 105.28s/it] 6%|β–ˆβ–ˆβ–‹ | 14/232 [27:47<6:22:07, 105.17s/it] {'loss': 2.4692, 'grad_norm': 0.3001886308193207, 'learning_rate': 9.969385700404346e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.72, 'epoch': 0.06}
6%|β–ˆβ–ˆβ–‹ | 14/232 [27:47<6:22:07, 105.17s/it] 6%|β–ˆβ–ˆβ–‰ | 15/232 [29:32<6:20:09, 105.11s/it] {'loss': 2.4818, 'grad_norm': 0.26973697543144226, 'learning_rate': 9.96126429152715e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.28, 'epoch': 0.06}
6%|β–ˆβ–ˆβ–‰ | 15/232 [29:32<6:20:09, 105.11s/it] 7%|β–ˆβ–ˆβ–ˆ | 16/232 [31:16<6:17:40, 104.91s/it] {'loss': 2.7253, 'grad_norm': 0.279231458902359, 'learning_rate': 9.952192642312713e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.53, 'epoch': 0.07}
7%|β–ˆβ–ˆβ–ˆ | 16/232 [31:16<6:17:40, 104.91s/it] 7%|β–ˆβ–ˆβ–ˆβ–Ž | 17/232 [33:01<6:15:58, 104.92s/it] {'loss': 2.5233, 'grad_norm': 0.2751440405845642, 'learning_rate': 9.942172490271169e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.47, 'epoch': 0.07}
7%|β–ˆβ–ˆβ–ˆβ–Ž | 17/232 [33:01<6:15:58, 104.92s/it] 8%|β–ˆβ–ˆβ–ˆβ– | 18/232 [34:46<6:13:59, 104.86s/it] {'loss': 2.3993, 'grad_norm': 0.2261095941066742, 'learning_rate': 9.931205754581203e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 590.3, 'epoch': 0.08}
8%|β–ˆβ–ˆβ–ˆβ– | 18/232 [34:46<6:13:59, 104.86s/it] 8%|β–ˆβ–ˆβ–ˆβ–‹ | 19/232 [36:30<6:11:38, 104.69s/it] {'loss': 2.451, 'grad_norm': 0.2214576005935669, 'learning_rate': 9.919294535722452e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.38, 'epoch': 0.08}
8%|β–ˆβ–ˆβ–ˆβ–‹ | 19/232 [36:30<6:11:38, 104.69s/it] 9%|β–ˆβ–ˆβ–ˆβ–‰ | 20/232 [38:15<6:10:28, 104.85s/it] {'loss': 2.4476, 'grad_norm': 0.22921393811702728, 'learning_rate': 9.9064411150732e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.55, 'epoch': 0.09}
9%|β–ˆβ–ˆβ–ˆβ–‰ | 20/232 [38:15<6:10:28, 104.85s/it] 9%|β–ˆβ–ˆβ–ˆβ–ˆ | 21/232 [40:00<6:08:21, 104.74s/it] {'loss': 2.5764, 'grad_norm': 0.2488836944103241, 'learning_rate': 9.892647954473425e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.2, 'epoch': 0.09}
9%|β–ˆβ–ˆβ–ˆβ–ˆ | 21/232 [40:00<6:08:21, 104.74s/it] 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 22/232 [41:44<6:06:26, 104.70s/it] {'loss': 2.4462, 'grad_norm': 0.21848219633102417, 'learning_rate': 9.877917695753275e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 547.83, 'epoch': 0.09}
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 22/232 [41:44<6:06:26, 104.70s/it] 10%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 23/232 [43:29<6:04:18, 104.59s/it] {'loss': 2.4739, 'grad_norm': 0.21689197421073914, 'learning_rate': 9.862253160227077e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.87, 'epoch': 0.1}
10%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 23/232 [43:29<6:04:18, 104.59s/it] 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 24/232 [45:14<6:03:49, 104.95s/it] {'loss': 2.5701, 'grad_norm': 0.24574564397335052, 'learning_rate': 9.845657348152958e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.54, 'epoch': 0.1}
10%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 24/232 [45:14<6:03:49, 104.95s/it][2025-10-07 12:36:33,154] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 12:36:35,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0339405536651611
[2025-10-07 12:36:36,484] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0328030586242676
[2025-10-07 12:36:37,528] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0433502197265625
[2025-10-07 12:36:38,568] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.039651870727539
[2025-10-07 12:36:38,569] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.75s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.65s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.22s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.55s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.74s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:57, 7.86s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.01s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.05s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.07s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:29<01:28, 8.09s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.17s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.16s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.15s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.89s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:47, 7.99s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.10s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:42<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.17s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.18s/it]
{'eval_loss': 2.468449831008911, 'eval_runtime': 187.9758, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.1}
10%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 24/232 [48:28<6:03:49, 104.95s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.18s/it]
[2025-10-07 12:39:46,575] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 12:39:57,444] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-24
11%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 25/232 [50:39<9:48:59, 170.72s/it] {'loss': 2.5522, 'grad_norm': 0.2543272078037262, 'learning_rate': 9.828133438158206e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 588.57, 'epoch': 0.11}
11%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 25/232 [50:39<9:48:59, 170.72s/it] 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 26/232 [52:24<8:38:51, 151.12s/it] {'loss': 2.3874, 'grad_norm': 0.2061944603919983, 'learning_rate': 9.809684786630462e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 575.63, 'epoch': 0.11}
11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 26/232 [52:24<8:38:51, 151.12s/it] 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 27/232 [54:09<7:49:00, 137.27s/it] {'loss': 2.4434, 'grad_norm': 0.21916723251342773, 'learning_rate': 9.79031492707486e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.46, 'epoch': 0.12}
12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 27/232 [54:09<7:49:00, 137.27s/it] 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 28/232 [55:53<7:13:08, 127.40s/it] {'loss': 2.4526, 'grad_norm': 0.21004539728164673, 'learning_rate': 9.770027569437252e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.07, 'epoch': 0.12}
12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 28/232 [55:53<7:13:08, 127.40s/it] 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/232 [57:38<6:48:02, 120.61s/it] {'loss': 2.462, 'grad_norm': 0.21658752858638763, 'learning_rate': 9.748826599393632e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 541.42, 'epoch': 0.12}
12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/232 [57:38<6:48:02, 120.61s/it] 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 30/232 [59:24<6:31:02, 116.15s/it] {'loss': 2.3799, 'grad_norm': 0.20760858058929443, 'learning_rate': 9.72671607760591e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 601.63, 'epoch': 0.13}
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 30/232 [59:24<6:31:02, 116.15s/it] 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 31/232 [1:01:08<6:17:07, 112.57s/it] {'loss': 2.384, 'grad_norm': 0.19674667716026306, 'learning_rate': 9.703700238944157e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 533.46, 'epoch': 0.13}
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 31/232 [1:01:08<6:17:07, 112.57s/it] 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 32/232 [1:02:53<6:07:29, 110.25s/it] {'loss': 2.5231, 'grad_norm': 0.17808720469474792, 'learning_rate': 9.679783491675507e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 609.74, 'epoch': 0.14}
14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 32/232 [1:02:53<6:07:29, 110.25s/it] 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 33/232 [1:04:37<5:59:49, 108.49s/it] {'loss': 2.4602, 'grad_norm': 0.20862546563148499, 'learning_rate': 9.654970416619814e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.45, 'epoch': 0.14}
14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 33/232 [1:04:37<5:59:49, 108.49s/it] 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 34/232 [1:06:22<5:54:24, 107.40s/it] {'loss': 2.5308, 'grad_norm': 0.19285555183887482, 'learning_rate': 9.629265766272293e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 589.77, 'epoch': 0.15}
15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 34/232 [1:06:22<5:54:24, 107.40s/it] 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 35/232 [1:08:07<5:50:22, 106.72s/it] {'loss': 2.4742, 'grad_norm': 0.2334047555923462, 'learning_rate': 9.602674463893266e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.45, 'epoch': 0.15}
15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 35/232 [1:08:07<5:50:22, 106.72s/it] 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 36/232 [1:09:52<5:46:15, 106.00s/it] {'loss': 2.6276, 'grad_norm': 0.2094365656375885, 'learning_rate': 9.575201602565192e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.7, 'epoch': 0.15}
16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 36/232 [1:09:52<5:46:15, 106.00s/it] 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 37/232 [1:11:36<5:43:15, 105.62s/it] {'loss': 2.3691, 'grad_norm': 0.1700724959373474, 'learning_rate': 9.54685244421718e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 573.17, 'epoch': 0.16}
16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 37/232 [1:11:36<5:43:15, 105.62s/it] 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 38/232 [1:13:21<5:40:47, 105.40s/it] {'loss': 2.5592, 'grad_norm': 0.3765801787376404, 'learning_rate': 9.517632418617173e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.1, 'epoch': 0.16}
16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 38/232 [1:13:21<5:40:47, 105.40s/it] 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 39/232 [1:15:05<5:37:51, 105.04s/it] {'loss': 2.3773, 'grad_norm': 0.2009992152452469, 'learning_rate': 9.487547122331965e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 566.78, 'epoch': 0.17}
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 39/232 [1:15:05<5:37:51, 105.04s/it] 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 40/232 [1:16:51<5:36:50, 105.26s/it] {'loss': 2.4833, 'grad_norm': 0.22405098378658295, 'learning_rate': 9.456602317655274e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 605.79, 'epoch': 0.17}
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 40/232 [1:16:51<5:36:50, 105.26s/it] 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 41/232 [1:18:36<5:34:21, 105.03s/it] {'loss': 2.4914, 'grad_norm': 0.18948987126350403, 'learning_rate': 9.424803931504095e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.18, 'epoch': 0.18}
18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 41/232 [1:18:36<5:34:21, 105.03s/it] 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 42/232 [1:20:21<5:32:32, 105.02s/it] {'loss': 2.4371, 'grad_norm': 0.16337323188781738, 'learning_rate': 9.392158054283497e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.47, 'epoch': 0.18}
18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 42/232 [1:20:21<5:32:32, 105.02s/it] 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 43/232 [1:22:05<5:30:04, 104.78s/it] {'loss': 2.4772, 'grad_norm': 0.17447726428508759, 'learning_rate': 9.358670938720114e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 628.39, 'epoch': 0.18}
19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 43/232 [1:22:05<5:30:04, 104.78s/it] 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 44/232 [1:23:49<5:27:50, 104.63s/it] {'loss': 2.4562, 'grad_norm': 0.1712619960308075, 'learning_rate': 9.32434899866455e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 557.26, 'epoch': 0.19}
19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 44/232 [1:23:49<5:27:50, 104.63s/it] 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 45/232 [1:25:34<5:26:24, 104.73s/it] {'loss': 2.4868, 'grad_norm': 0.1861104965209961, 'learning_rate': 9.289198807862929e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.39, 'epoch': 0.19}
19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 45/232 [1:25:34<5:26:24, 104.73s/it] 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 46/232 [1:27:19<5:24:25, 104.65s/it] {'loss': 2.6399, 'grad_norm': 0.19723129272460938, 'learning_rate': 9.253227098697804e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.28, 'epoch': 0.2}
20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 46/232 [1:27:19<5:24:25, 104.65s/it] 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 47/232 [1:29:04<5:23:09, 104.81s/it] {'loss': 2.5103, 'grad_norm': 0.18864890933036804, 'learning_rate': 9.216440760898695e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.2, 'epoch': 0.2}
20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 47/232 [1:29:04<5:23:09, 104.81s/it] 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 48/232 [1:30:50<5:22:25, 105.14s/it] {'loss': 2.4005, 'grad_norm': 0.1855887770652771, 'learning_rate': 9.178846840222489e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.83, 'epoch': 0.21}
21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 48/232 [1:30:50<5:22:25, 105.14s/it][2025-10-07 13:22:08,383] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 13:22:10,654] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0294299125671387
[2025-10-07 13:22:11,684] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0294687747955322
[2025-10-07 13:22:12,715] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0308668613433838
[2025-10-07 13:22:13,749] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.033970594406128
[2025-10-07 13:22:13,751] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.75s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.66s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.22s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.55s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.74s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:57, 7.86s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.01s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.05s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:29<01:29, 8.10s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.17s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.16s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.89s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:47, 8.00s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.10s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:42<00:16, 8.16s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.17s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
{'eval_loss': 2.440826654434204, 'eval_runtime': 188.032, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.21}
21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 48/232 [1:34:03<5:22:25, 105.14s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
[2025-10-07 13:25:21,811] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 13:25:32,526] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-48
21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 49/232 [1:36:14<8:41:09, 170.87s/it] {'loss': 2.3372, 'grad_norm': 0.15217913687229156, 'learning_rate': 9.140452537103943e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 628.86, 'epoch': 0.21}
21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 49/232 [1:36:14<8:41:09, 170.87s/it] 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 50/232 [1:38:00<7:39:21, 151.44s/it] {'loss': 2.4017, 'grad_norm': 0.15943582355976105, 'learning_rate': 9.101265205276581e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 563.54, 'epoch': 0.22}
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 50/232 [1:38:00<7:39:21, 151.44s/it] 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 51/232 [1:39:45<6:54:29, 137.40s/it] {'loss': 2.3504, 'grad_norm': 0.21394406259059906, 'learning_rate': 9.061292350364222e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.33, 'epoch': 0.22}
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 51/232 [1:39:45<6:54:29, 137.40s/it] 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 52/232 [1:41:30<6:23:05, 127.69s/it] {'loss': 2.3518, 'grad_norm': 0.16352114081382751, 'learning_rate': 9.020541628443395e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.89, 'epoch': 0.22}
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 52/232 [1:41:30<6:23:05, 127.69s/it] 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 53/232 [1:43:14<5:59:58, 120.66s/it] {'loss': 2.4803, 'grad_norm': 0.24781003594398499, 'learning_rate': 8.979020844576982e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 573.01, 'epoch': 0.23}
23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 53/232 [1:43:14<5:59:58, 120.66s/it] 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 54/232 [1:44:59<5:43:55, 115.93s/it] {'loss': 2.2747, 'grad_norm': 0.1854228526353836, 'learning_rate': 8.936737951319276e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.86, 'epoch': 0.23}
23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 54/232 [1:44:59<5:43:55, 115.93s/it] 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 55/232 [1:46:44<5:32:07, 112.59s/it] {'loss': 2.4225, 'grad_norm': 0.22357748448848724, 'learning_rate': 8.893701047192832e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.98, 'epoch': 0.24}
24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 55/232 [1:46:44<5:32:07, 112.59s/it] 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 56/232 [1:48:28<5:23:05, 110.14s/it] {'loss': 2.2967, 'grad_norm': 0.14629872143268585, 'learning_rate': 8.84991837513733e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 627.51, 'epoch': 0.24}
24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 56/232 [1:48:28<5:23:05, 110.14s/it] 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 57/232 [1:50:13<5:16:41, 108.58s/it] {'loss': 2.4355, 'grad_norm': 0.18156284093856812, 'learning_rate': 8.805398320930792e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.09, 'epoch': 0.25}
25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 57/232 [1:50:13<5:16:41, 108.58s/it] 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 58/232 [1:51:58<5:11:49, 107.53s/it] {'loss': 2.4022, 'grad_norm': 0.21640124917030334, 'learning_rate': 8.760149411583436e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.78, 'epoch': 0.25}
25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 58/232 [1:51:58<5:11:49, 107.53s/it] 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 59/232 [1:53:43<5:07:28, 106.64s/it] {'loss': 2.5751, 'grad_norm': 0.19623495638370514, 'learning_rate': 8.71418031370449e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.73, 'epoch': 0.25}
25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 59/232 [1:53:43<5:07:28, 106.64s/it] 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 60/232 [1:55:29<5:05:06, 106.43s/it] {'loss': 2.3643, 'grad_norm': 0.16625775396823883, 'learning_rate': 8.667499831842252e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 618.64, 'epoch': 0.26}
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 60/232 [1:55:29<5:05:06, 106.43s/it] 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 61/232 [1:57:13<5:01:41, 105.86s/it] {'loss': 2.5068, 'grad_norm': 0.16208383440971375, 'learning_rate': 8.62011690679774e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 596.28, 'epoch': 0.26}
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 61/232 [1:57:13<5:01:41, 105.86s/it] 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 62/232 [1:58:58<4:59:07, 105.57s/it] {'loss': 2.5253, 'grad_norm': 0.15697439014911652, 'learning_rate': 8.572040613912241e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 492.93, 'epoch': 0.27}
27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 62/232 [1:58:58<4:59:07, 105.57s/it] 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 63/232 [2:00:43<4:56:30, 105.27s/it] {'loss': 2.554, 'grad_norm': 0.2069503366947174, 'learning_rate': 8.5232801613291e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.83, 'epoch': 0.27}
27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 63/232 [2:00:43<4:56:30, 105.27s/it] 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 64/232 [2:02:27<4:54:27, 105.16s/it] {'loss': 2.4899, 'grad_norm': 0.1628636121749878, 'learning_rate': 8.473844888230065e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 565.15, 'epoch': 0.28}
28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 64/232 [2:02:27<4:54:27, 105.16s/it] 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 65/232 [2:04:12<4:52:23, 105.05s/it] {'loss': 2.4041, 'grad_norm': 0.16927234828472137, 'learning_rate': 8.42374426304653e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 585.84, 'epoch': 0.28}
28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 65/232 [2:04:12<4:52:23, 105.05s/it] 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 66/232 [2:05:55<4:48:48, 104.39s/it] {'loss': 2.5481, 'grad_norm': 0.16745679080486298, 'learning_rate': 8.372987881646036e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 630.04, 'epoch': 0.28}
28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 66/232 [2:05:55<4:48:48, 104.39s/it] 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 67/232 [2:07:40<4:47:28, 104.53s/it] {'loss': 2.3923, 'grad_norm': 0.15913288295269012, 'learning_rate': 8.32158546549435e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 605.21, 'epoch': 0.29}
29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 67/232 [2:07:40<4:47:28, 104.53s/it] 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 68/232 [2:09:25<4:45:52, 104.59s/it] {'loss': 2.5484, 'grad_norm': 0.16244389116764069, 'learning_rate': 8.269546859793499e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 475.22, 'epoch': 0.29}
29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 68/232 [2:09:25<4:45:52, 104.59s/it] 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 69/232 [2:11:09<4:43:56, 104.52s/it] {'loss': 2.3085, 'grad_norm': 0.14511074125766754, 'learning_rate': 8.216882031596098e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.67, 'epoch': 0.3}
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 69/232 [2:11:09<4:43:56, 104.52s/it] 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 70/232 [2:12:55<4:43:37, 105.04s/it] {'loss': 2.5241, 'grad_norm': 0.16853027045726776, 'learning_rate': 8.163601067896344e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.74, 'epoch': 0.3}
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 70/232 [2:12:55<4:43:37, 105.04s/it] 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 71/232 [2:14:40<4:41:28, 104.90s/it] {'loss': 2.3546, 'grad_norm': 0.13500231504440308, 'learning_rate': 8.109714173698027e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.86, 'epoch': 0.31}
31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 71/232 [2:14:40<4:41:28, 104.90s/it] 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 72/232 [2:16:26<4:40:27, 105.17s/it] {'loss': 2.358, 'grad_norm': 0.1682259440422058, 'learning_rate': 8.055231670059958e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.33, 'epoch': 0.31}
31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 72/232 [2:16:26<4:40:27, 105.17s/it][2025-10-07 14:07:44,423] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 14:07:46,706] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0272562503814697
[2025-10-07 14:07:47,758] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0513060092926025
[2025-10-07 14:07:48,796] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.038541555404663
[2025-10-07 14:07:49,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0161314010620117
[2025-10-07 14:07:49,814] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.07s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.76s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.67s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.23s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.75s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.87s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.05s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:29<01:29, 8.09s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.17s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.16s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.89s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.00s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:42<00:16, 8.18s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
{'eval_loss': 2.4302256107330322, 'eval_runtime': 188.137, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.31}
31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 72/232 [2:19:39<4:40:27, 105.17s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
[2025-10-07 14:10:57,960] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 14:11:08,752] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-72
31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 73/232 [2:21:50<7:32:42, 170.84s/it] {'loss': 2.5516, 'grad_norm': 0.37493956089019775, 'learning_rate': 8.000163992119146e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 539.21, 'epoch': 0.31}
31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 73/232 [2:21:50<7:32:42, 170.84s/it] 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 74/232 [2:23:35<6:38:13, 151.22s/it] {'loss': 2.3886, 'grad_norm': 0.15932448208332062, 'learning_rate': 7.944521687092143e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 627.95, 'epoch': 0.32}
32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 74/232 [2:23:35<6:38:13, 151.22s/it] 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 75/232 [2:25:20<5:59:32, 137.40s/it] {'loss': 2.5004, 'grad_norm': 0.1597600132226944, 'learning_rate': 7.888315412254921e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.22, 'epoch': 0.32}
32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 75/232 [2:25:20<5:59:32, 137.40s/it] 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 76/232 [2:27:02<5:29:12, 126.62s/it] {'loss': 2.4302, 'grad_norm': 0.15421240031719208, 'learning_rate': 7.831555932901642e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.04, 'epoch': 0.33}
33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 76/232 [2:27:02<5:29:12, 126.62s/it] 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 77/232 [2:28:47<5:10:30, 120.20s/it] {'loss': 2.7413, 'grad_norm': 0.18825848400592804, 'learning_rate': 7.774254120282792e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.97, 'epoch': 0.33}
33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 77/232 [2:28:47<5:10:30, 120.20s/it] 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 78/232 [2:30:32<4:56:50, 115.65s/it] {'loss': 2.4924, 'grad_norm': 0.18012621998786926, 'learning_rate': 7.71642094952296e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 537.21, 'epoch': 0.34}
34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 78/232 [2:30:32<4:56:50, 115.65s/it] 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 79/232 [2:32:16<4:46:13, 112.24s/it] {'loss': 2.3216, 'grad_norm': 0.15518777072429657, 'learning_rate': 7.658067497518773e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 442.25, 'epoch': 0.34}
34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 79/232 [2:32:16<4:46:13, 112.24s/it] 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 80/232 [2:34:02<4:39:28, 110.32s/it] {'loss': 2.4516, 'grad_norm': 0.1714549958705902, 'learning_rate': 7.599204940817309e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 482.12, 'epoch': 0.34}
34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 80/232 [2:34:02<4:39:28, 110.32s/it] 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 81/232 [2:35:47<4:33:26, 108.65s/it] {'loss': 2.4457, 'grad_norm': 0.1730002611875534, 'learning_rate': 7.539844553475427e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.62, 'epoch': 0.35}
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 81/232 [2:35:47<4:33:26, 108.65s/it] 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 82/232 [2:37:32<4:28:49, 107.53s/it] {'loss': 2.3308, 'grad_norm': 0.14245469868183136, 'learning_rate': 7.479997704900437e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 556.71, 'epoch': 0.35}
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 82/232 [2:37:32<4:28:49, 107.53s/it] 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 83/232 [2:39:16<4:24:40, 106.58s/it] {'loss': 2.3798, 'grad_norm': 0.15552735328674316, 'learning_rate': 7.4196758576724835e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 569.3, 'epoch': 0.36}
36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 83/232 [2:39:16<4:24:40, 106.58s/it] 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 84/232 [2:41:01<4:21:54, 106.18s/it] {'loss': 2.5172, 'grad_norm': 0.1897697001695633, 'learning_rate': 7.358890565349106e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.77, 'epoch': 0.36}
36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 84/232 [2:41:01<4:21:54, 106.18s/it] 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 85/232 [2:42:47<4:19:20, 105.85s/it] {'loss': 2.4283, 'grad_norm': 0.17907385528087616, 'learning_rate': 7.297653470252359e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.1, 'epoch': 0.37}
37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 85/232 [2:42:47<4:19:20, 105.85s/it] 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 86/232 [2:44:31<4:16:38, 105.47s/it] {'loss': 2.44, 'grad_norm': 0.16645929217338562, 'learning_rate': 7.235976301238933e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.75, 'epoch': 0.37}
37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 86/232 [2:44:31<4:16:38, 105.47s/it] 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 87/232 [2:46:16<4:14:33, 105.33s/it] {'loss': 2.3995, 'grad_norm': 0.18987996876239777, 'learning_rate': 7.1738708714537165e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.08, 'epoch': 0.37}
38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 87/232 [2:46:16<4:14:33, 105.33s/it] 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 88/232 [2:48:01<4:12:24, 105.17s/it] {'loss': 2.4353, 'grad_norm': 0.3808911442756653, 'learning_rate': 7.111349076067186e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 521.64, 'epoch': 0.38}
38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 88/232 [2:48:01<4:12:24, 105.17s/it] 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 89/232 [2:49:45<4:10:07, 104.95s/it] {'loss': 2.5919, 'grad_norm': 0.1509799361228943, 'learning_rate': 7.048422889997115e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 545.32, 'epoch': 0.38}
38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 89/232 [2:49:45<4:10:07, 104.95s/it] 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 90/232 [2:51:31<4:08:59, 105.21s/it] {'loss': 2.483, 'grad_norm': 0.15931715071201324, 'learning_rate': 6.985104365614987e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 606.25, 'epoch': 0.39}
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 90/232 [2:51:31<4:08:59, 105.21s/it] 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 91/232 [2:53:16<4:06:52, 105.05s/it] {'loss': 2.4388, 'grad_norm': 0.1680772304534912, 'learning_rate': 6.921405630437585e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.99, 'epoch': 0.39}
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 91/232 [2:53:16<4:06:52, 105.05s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 92/232 [2:55:01<4:05:06, 105.04s/it] {'loss': 2.4395, 'grad_norm': 0.1529005765914917, 'learning_rate': 6.857338884804185e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 528.13, 'epoch': 0.4}
40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 92/232 [2:55:01<4:05:06, 105.04s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 93/232 [2:56:42<4:00:21, 103.75s/it] {'loss': 2.3016, 'grad_norm': 0.27351638674736023, 'learning_rate': 6.792916399539805e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 477.83, 'epoch': 0.4}
40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 93/232 [2:56:42<4:00:21, 103.75s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 94/232 [2:58:27<3:59:37, 104.18s/it] {'loss': 2.3479, 'grad_norm': 0.1746247112751007, 'learning_rate': 6.728150513604942e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 551.33, 'epoch': 0.4}
41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 94/232 [2:58:27<3:59:37, 104.18s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 95/232 [3:00:09<3:56:41, 103.66s/it] {'loss': 2.477, 'grad_norm': 0.16394348442554474, 'learning_rate': 6.663053631732279e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 631.79, 'epoch': 0.41}
41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 95/232 [3:00:09<3:56:41, 103.66s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 96/232 [3:01:54<3:55:53, 104.07s/it] {'loss': 2.2869, 'grad_norm': 0.1481611430644989, 'learning_rate': 6.597638222050773e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 390.53, 'epoch': 0.41}
41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 96/232 [3:01:54<3:55:53, 104.07s/it][2025-10-07 14:53:13,026] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 14:53:15,597] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1568572521209717
[2025-10-07 14:53:16,769] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1719346046447754
[2025-10-07 14:53:17,955] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.185570478439331
[2025-10-07 14:53:19,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1617960929870605
[2025-10-07 14:53:19,118] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.75s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.66s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.23s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.74s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.87s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.05s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:29<01:29, 8.10s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.17s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.16s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:47, 8.00s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:42<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
{'eval_loss': 2.423957347869873, 'eval_runtime': 188.1087, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.41}
41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 96/232 [3:05:08<3:55:53, 104.07s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
[2025-10-07 14:56:27,236] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 14:56:37,557] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-96
42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 97/232 [3:07:19<6:23:07, 170.28s/it] {'loss': 2.3912, 'grad_norm': 0.15088680386543274, 'learning_rate': 6.5319168136976155e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 615.67, 'epoch': 0.42}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 97/232 [3:07:19<6:23:07, 170.28s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 98/232 [3:09:04<5:36:40, 150.75s/it] {'loss': 2.3082, 'grad_norm': 0.15012118220329285, 'learning_rate': 6.465901994418505e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 574.1, 'epoch': 0.42}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 98/232 [3:09:04<5:36:40, 150.75s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 99/232 [3:10:49<5:03:43, 137.02s/it] {'loss': 2.6214, 'grad_norm': 0.17665165662765503, 'learning_rate': 6.399606408156688e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 603.33, 'epoch': 0.43}
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 99/232 [3:10:49<5:03:43, 137.02s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 100/232 [3:12:35<4:40:31, 127.51s/it] {'loss': 2.3682, 'grad_norm': 0.1719701886177063, 'learning_rate': 6.333042752631243e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 583.49, 'epoch': 0.43}
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 100/232 [3:12:35<4:40:31, 127.51s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 101/232 [3:14:19<4:23:28, 120.68s/it] {'loss': 2.5312, 'grad_norm': 0.15609215199947357, 'learning_rate': 6.266223776905062e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.84, 'epoch': 0.43}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 101/232 [3:14:19<4:23:28, 120.68s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 102/232 [3:16:04<4:11:21, 116.01s/it] {'loss': 2.3856, 'grad_norm': 0.18730811774730682, 'learning_rate': 6.199162278942997e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.95, 'epoch': 0.44}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 102/232 [3:16:04<4:11:21, 116.01s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 103/232 [3:17:49<4:02:00, 112.56s/it] {'loss': 2.563, 'grad_norm': 0.1721767634153366, 'learning_rate': 6.131871103160644e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.79, 'epoch': 0.44}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 103/232 [3:17:49<4:02:00, 112.56s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 104/232 [3:19:34<3:55:23, 110.34s/it] {'loss': 2.3592, 'grad_norm': 0.20805968344211578, 'learning_rate': 6.064363137964225e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.17, 'epoch': 0.45}
45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 104/232 [3:19:34<3:55:23, 110.34s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 105/232 [3:21:19<3:50:13, 108.77s/it] {'loss': 2.4238, 'grad_norm': 0.16840516030788422, 'learning_rate': 5.996651313282051e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.25, 'epoch': 0.45}
45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 105/232 [3:21:19<3:50:13, 108.77s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 106/232 [3:23:04<3:45:50, 107.55s/it] {'loss': 2.3565, 'grad_norm': 0.17043235898017883, 'learning_rate': 5.9287485980880245e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.98, 'epoch': 0.46}
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 106/232 [3:23:04<3:45:50, 107.55s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 107/232 [3:24:49<3:42:28, 106.79s/it] {'loss': 2.2732, 'grad_norm': 0.15569466352462769, 'learning_rate': 5.860667997917668e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.04, 'epoch': 0.46}
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 107/232 [3:24:49<3:42:28, 106.79s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 108/232 [3:26:34<3:39:40, 106.30s/it] {'loss': 2.4234, 'grad_norm': 0.1824769228696823, 'learning_rate': 5.792422552377153e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.93, 'epoch': 0.46}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 108/232 [3:26:34<3:39:40, 106.30s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 109/232 [3:28:18<3:36:43, 105.72s/it] {'loss': 2.4736, 'grad_norm': 0.17908412218093872, 'learning_rate': 5.724025332645794e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.4, 'epoch': 0.47}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 109/232 [3:28:18<3:36:43, 105.72s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 110/232 [3:29:59<3:32:02, 104.28s/it] {'loss': 2.3608, 'grad_norm': 0.14559771120548248, 'learning_rate': 5.655489438972503e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 584.43, 'epoch': 0.47}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 110/232 [3:29:59<3:32:02, 104.28s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 111/232 [3:31:44<3:30:26, 104.35s/it] {'loss': 2.3306, 'grad_norm': 0.14917968213558197, 'learning_rate': 5.586827998166678e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 552.58, 'epoch': 0.48}
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 111/232 [3:31:44<3:30:26, 104.35s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 112/232 [3:33:29<3:29:00, 104.51s/it] {'loss': 2.432, 'grad_norm': 0.20721513032913208, 'learning_rate': 5.518054161083994e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 515.93, 'epoch': 0.48}
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 112/232 [3:33:29<3:29:00, 104.51s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 113/232 [3:35:13<3:27:21, 104.55s/it] {'loss': 2.4911, 'grad_norm': 0.3023054897785187, 'learning_rate': 5.449181100107599e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.31, 'epoch': 0.49}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 113/232 [3:35:13<3:27:21, 104.55s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 114/232 [3:36:58<3:25:54, 104.70s/it] {'loss': 2.5054, 'grad_norm': 0.1722680777311325, 'learning_rate': 5.38022200662518e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 532.56, 'epoch': 0.49}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 114/232 [3:36:58<3:25:54, 104.70s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 115/232 [3:38:41<3:22:52, 104.03s/it] {'loss': 2.4839, 'grad_norm': 0.16150489449501038, 'learning_rate': 5.31119008850239e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 513.61, 'epoch': 0.49}
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 115/232 [3:38:41<3:22:52, 104.03s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 116/232 [3:40:25<3:21:20, 104.14s/it] {'loss': 2.3893, 'grad_norm': 0.22127696871757507, 'learning_rate': 5.242098567553133e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 588.46, 'epoch': 0.5}
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 116/232 [3:40:25<3:21:20, 104.14s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 117/232 [3:42:10<3:20:05, 104.40s/it] {'loss': 2.2837, 'grad_norm': 0.15573346614837646, 'learning_rate': 5.1729606770071395e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 592.59, 'epoch': 0.5}
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 117/232 [3:42:10<3:20:05, 104.40s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 118/232 [3:43:55<3:18:43, 104.59s/it] {'loss': 2.5217, 'grad_norm': 0.16544535756111145, 'learning_rate': 5.103789658975413e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 578.58, 'epoch': 0.51}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 118/232 [3:43:55<3:18:43, 104.59s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 119/232 [3:45:40<3:17:00, 104.60s/it] {'loss': 2.3913, 'grad_norm': 0.16304516792297363, 'learning_rate': 5.034598761913917e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.38, 'epoch': 0.51}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 119/232 [3:45:40<3:17:00, 104.60s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 120/232 [3:47:26<3:15:54, 104.95s/it] {'loss': 2.4939, 'grad_norm': 0.1516706794500351, 'learning_rate': 4.965401238086084e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 587.79, 'epoch': 0.52}
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 120/232 [3:47:26<3:15:54, 104.95s/it][2025-10-07 15:38:44,450] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 15:38:46,763] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0435214042663574
[2025-10-07 15:38:47,796] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.032888650894165
[2025-10-07 15:38:48,835] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0386509895324707
[2025-10-07 15:38:49,875] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0395262241363525
[2025-10-07 15:38:49,876] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.07s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.76s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.67s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.24s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.75s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.87s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.06s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:30<01:29, 8.10s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.17s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.16s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.00s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:43<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
{'eval_loss': 2.4198384284973145, 'eval_runtime': 188.1898, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.52}
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 120/232 [3:50:39<3:15:54, 104.95s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
[2025-10-07 15:41:58,075] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 15:42:08,817] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-120
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 121/232 [3:52:49<5:15:34, 170.58s/it] {'loss': 2.4623, 'grad_norm': 0.1619013249874115, 'learning_rate': 4.896210341024587e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 627.33, 'epoch': 0.52}
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 121/232 [3:52:49<5:15:34, 170.58s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 122/232 [3:54:35<4:36:58, 151.08s/it] {'loss': 2.409, 'grad_norm': 0.18715393543243408, 'learning_rate': 4.827039322992861e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 584.4, 'epoch': 0.52}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 122/232 [3:54:35<4:36:58, 151.08s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 123/232 [3:56:20<4:09:29, 137.33s/it] {'loss': 2.3789, 'grad_norm': 0.16886143386363983, 'learning_rate': 4.75790143244687e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 548.91, 'epoch': 0.53}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 123/232 [3:56:20<4:09:29, 137.33s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 124/232 [3:58:02<3:47:52, 126.60s/it] {'loss': 2.5186, 'grad_norm': 0.1661851704120636, 'learning_rate': 4.68880991149761e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 630.15, 'epoch': 0.53}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 124/232 [3:58:02<3:47:52, 126.60s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 125/232 [3:59:47<3:34:17, 120.16s/it] {'loss': 2.4673, 'grad_norm': 0.29293861985206604, 'learning_rate': 4.6197779933748226e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.33, 'epoch': 0.54}
54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 125/232 [3:59:47<3:34:17, 120.16s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 126/232 [4:01:32<3:24:26, 115.72s/it] {'loss': 2.5636, 'grad_norm': 0.17951923608779907, 'learning_rate': 4.550818899892402e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.02, 'epoch': 0.54}
54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 126/232 [4:01:32<3:24:26, 115.72s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 127/232 [4:03:17<3:16:45, 112.43s/it] {'loss': 2.4493, 'grad_norm': 0.19470256567001343, 'learning_rate': 4.481945838916006e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.59, 'epoch': 0.55}
55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 127/232 [4:03:17<3:16:45, 112.43s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 128/232 [4:05:02<3:11:05, 110.25s/it] {'loss': 2.3728, 'grad_norm': 0.16387903690338135, 'learning_rate': 4.413172001833324e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.37, 'epoch': 0.55}
55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 128/232 [4:05:02<3:11:05, 110.25s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 129/232 [4:06:47<3:06:41, 108.75s/it] {'loss': 2.2864, 'grad_norm': 0.16799062490463257, 'learning_rate': 4.344510561027498e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.67, 'epoch': 0.55}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 129/232 [4:06:47<3:06:41, 108.75s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 130/232 [4:08:33<3:03:14, 107.79s/it] {'loss': 2.4157, 'grad_norm': 0.14640620350837708, 'learning_rate': 4.275974667354208e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 572.96, 'epoch': 0.56}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 130/232 [4:08:33<3:03:14, 107.79s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 131/232 [4:10:18<3:00:03, 106.96s/it] {'loss': 2.5456, 'grad_norm': 0.21564586460590363, 'learning_rate': 4.207577447622849e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 624.04, 'epoch': 0.56}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 131/232 [4:10:18<3:00:03, 106.96s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 132/232 [4:12:03<2:57:22, 106.42s/it] {'loss': 2.5254, 'grad_norm': 0.1728494018316269, 'learning_rate': 4.139332002082333e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 575.86, 'epoch': 0.57}
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 132/232 [4:12:03<2:57:22, 106.42s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 133/232 [4:13:48<2:54:42, 105.88s/it] {'loss': 2.3662, 'grad_norm': 0.23733289539813995, 'learning_rate': 4.071251401911977e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.44, 'epoch': 0.57}
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 133/232 [4:13:48<2:54:42, 105.88s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 134/232 [4:15:33<2:52:36, 105.68s/it] {'loss': 2.3741, 'grad_norm': 0.16287827491760254, 'learning_rate': 4.00334868671795e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 549.45, 'epoch': 0.58}
58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 134/232 [4:15:33<2:52:36, 105.68s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 135/232 [4:17:18<2:50:34, 105.51s/it] {'loss': 2.463, 'grad_norm': 0.19709622859954834, 'learning_rate': 3.935636862035776e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 539.3, 'epoch': 0.58}
58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 135/232 [4:17:18<2:50:34, 105.51s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 136/232 [4:19:03<2:48:16, 105.17s/it] {'loss': 2.3745, 'grad_norm': 0.1827632039785385, 'learning_rate': 3.868128896839357e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 597.28, 'epoch': 0.58}
59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 136/232 [4:19:03<2:48:16, 105.17s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 137/232 [4:20:48<2:46:31, 105.17s/it] {'loss': 2.4479, 'grad_norm': 0.19809529185295105, 'learning_rate': 3.8008377210570045e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 593.94, 'epoch': 0.59}
59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 137/232 [4:20:48<2:46:31, 105.17s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 138/232 [4:22:33<2:44:49, 105.20s/it] {'loss': 2.4662, 'grad_norm': 0.17152650654315948, 'learning_rate': 3.7337762230949397e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.54, 'epoch': 0.59}
59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 138/232 [4:22:33<2:44:49, 105.20s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 139/232 [4:24:18<2:42:50, 105.06s/it] {'loss': 2.3467, 'grad_norm': 0.16035908460617065, 'learning_rate': 3.6669572473687577e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.85, 'epoch': 0.6}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 139/232 [4:24:18<2:42:50, 105.06s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 140/232 [4:26:04<2:41:31, 105.34s/it] {'loss': 2.6583, 'grad_norm': 0.1990923434495926, 'learning_rate': 3.6003935918433124e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.49, 'epoch': 0.6}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 140/232 [4:26:04<2:41:31, 105.34s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 141/232 [4:27:48<2:39:28, 105.15s/it] {'loss': 2.4977, 'grad_norm': 0.17929832637310028, 'learning_rate': 3.534098005581497e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 615.64, 'epoch': 0.61}
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 141/232 [4:27:48<2:39:28, 105.15s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 142/232 [4:29:33<2:37:40, 105.12s/it] {'loss': 2.604, 'grad_norm': 0.16450382769107819, 'learning_rate': 3.4680831863023866e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.82, 'epoch': 0.61}
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 142/232 [4:29:33<2:37:40, 105.12s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 143/232 [4:31:18<2:35:44, 104.99s/it] {'loss': 2.3583, 'grad_norm': 0.15677917003631592, 'learning_rate': 3.402361777949229e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 598.1, 'epoch': 0.62}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 143/232 [4:31:18<2:35:44, 104.99s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 144/232 [4:33:04<2:34:22, 105.26s/it] {'loss': 2.6741, 'grad_norm': 0.26103436946868896, 'learning_rate': 3.336946368267724e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 505.75, 'epoch': 0.62}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 144/232 [4:33:04<2:34:22, 105.26s/it][2025-10-07 16:24:22,771] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 16:24:25,081] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0497405529022217
[2025-10-07 16:24:26,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.035264253616333
[2025-10-07 16:24:27,150] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0325191020965576
[2025-10-07 16:24:28,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0278091430664062
[2025-10-07 16:24:28,179] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.76s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.66s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.23s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.75s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.87s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.05s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:36, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:30<01:29, 8.10s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.18s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.17s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.00s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.10s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:43<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
{'eval_loss': 2.417459726333618, 'eval_runtime': 188.1589, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.62}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 144/232 [4:36:18<2:34:22, 105.26s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.19s/it]
[2025-10-07 16:27:36,365] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 16:27:47,153] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-144
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 145/232 [4:38:29<4:08:02, 171.07s/it] {'loss': 2.4198, 'grad_norm': 0.15687525272369385, 'learning_rate': 3.271849486395059e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 544.63, 'epoch': 0.62}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 145/232 [4:38:29<4:08:02, 171.07s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 146/232 [4:40:14<3:36:59, 151.39s/it] {'loss': 2.4322, 'grad_norm': 0.14793400466442108, 'learning_rate': 3.207083600460196e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 582.54, 'epoch': 0.63}
63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 146/232 [4:40:14<3:36:59, 151.39s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 147/232 [4:41:59<3:14:53, 137.57s/it] {'loss': 2.4873, 'grad_norm': 0.1773616373538971, 'learning_rate': 3.1426611151958146e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.19, 'epoch': 0.63}
63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 147/232 [4:41:59<3:14:53, 137.57s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 148/232 [4:43:44<2:58:45, 127.69s/it] {'loss': 2.4418, 'grad_norm': 0.1769264191389084, 'learning_rate': 3.078594369562417e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.38, 'epoch': 0.64}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 148/232 [4:43:44<2:58:45, 127.69s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 149/232 [4:45:29<2:47:15, 120.91s/it] {'loss': 2.4109, 'grad_norm': 0.1511746048927307, 'learning_rate': 3.0148956343850143e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 585.43, 'epoch': 0.64}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 149/232 [4:45:29<2:47:15, 120.91s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 150/232 [4:47:15<2:39:12, 116.50s/it] {'loss': 2.4526, 'grad_norm': 0.1719832420349121, 'learning_rate': 2.9515771100028854e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 617.13, 'epoch': 0.65}
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 150/232 [4:47:15<2:39:12, 116.50s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 151/232 [4:49:00<2:32:29, 112.95s/it] {'loss': 2.4441, 'grad_norm': 0.1828099489212036, 'learning_rate': 2.888650923932815e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.57, 'epoch': 0.65}
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 151/232 [4:49:00<2:32:29, 112.95s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 152/232 [4:50:45<2:27:29, 110.62s/it] {'loss': 2.277, 'grad_norm': 0.15714910626411438, 'learning_rate': 2.8261291285462843e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.21, 'epoch': 0.65}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 152/232 [4:50:45<2:27:29, 110.62s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 153/232 [4:52:27<2:21:59, 107.84s/it] {'loss': 2.4279, 'grad_norm': 0.15972191095352173, 'learning_rate': 2.7640236987610662e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 510.77, 'epoch': 0.66}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 153/232 [4:52:27<2:21:59, 107.84s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 154/232 [4:54:10<2:18:34, 106.60s/it] {'loss': 2.4685, 'grad_norm': 0.16657793521881104, 'learning_rate': 2.7023465297476426e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.24, 'epoch': 0.66}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 154/232 [4:54:10<2:18:34, 106.60s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 155/232 [4:55:56<2:16:17, 106.21s/it] {'loss': 2.4661, 'grad_norm': 0.19078649580478668, 'learning_rate': 2.641109434650894e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.4, 'epoch': 0.67}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 155/232 [4:55:56<2:16:17, 106.21s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 156/232 [4:57:39<2:13:34, 105.46s/it] {'loss': 2.4035, 'grad_norm': 0.20265750586986542, 'learning_rate': 2.580324142327516e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 535.76, 'epoch': 0.67}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 156/232 [4:57:39<2:13:34, 105.46s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 157/232 [4:59:23<2:11:14, 105.00s/it] {'loss': 2.3609, 'grad_norm': 0.18236300349235535, 'learning_rate': 2.520002295099564e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 629.39, 'epoch': 0.68}
68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 157/232 [4:59:23<2:11:14, 105.00s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 158/232 [5:01:09<2:09:40, 105.14s/it] {'loss': 2.6265, 'grad_norm': 0.18648919463157654, 'learning_rate': 2.460155446524573e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.42, 'epoch': 0.68}
68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 158/232 [5:01:09<2:09:40, 105.14s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 159/232 [5:02:53<2:07:48, 105.05s/it] {'loss': 2.5449, 'grad_norm': 0.8310457468032837, 'learning_rate': 2.400795059182692e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.21, 'epoch': 0.68}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 159/232 [5:02:54<2:07:48, 105.05s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 160/232 [5:04:37<2:05:36, 104.67s/it] {'loss': 2.3449, 'grad_norm': 0.15494287014007568, 'learning_rate': 2.341932502481226e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 537.9, 'epoch': 0.69}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 160/232 [5:04:37<2:05:36, 104.67s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 161/232 [5:06:19<2:02:57, 103.91s/it] {'loss': 2.3852, 'grad_norm': 0.34528088569641113, 'learning_rate': 2.283579050477042e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 608.59, 'epoch': 0.69}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 161/232 [5:06:19<2:02:57, 103.91s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 162/232 [5:08:04<2:01:32, 104.18s/it] {'loss': 2.41, 'grad_norm': 0.18780948221683502, 'learning_rate': 2.2257458797172093e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 470.71, 'epoch': 0.7}
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 162/232 [5:08:04<2:01:32, 104.18s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 163/232 [5:09:49<1:59:52, 104.23s/it] {'loss': 2.449, 'grad_norm': 0.18119558691978455, 'learning_rate': 2.1684440670983568e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 598.12, 'epoch': 0.7}
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 163/232 [5:09:49<1:59:52, 104.23s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 164/232 [5:11:34<1:58:23, 104.46s/it] {'loss': 2.3244, 'grad_norm': 0.1482125073671341, 'learning_rate': 2.111684587745081e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 533.73, 'epoch': 0.71}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 164/232 [5:11:34<1:58:23, 104.46s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 165/232 [5:13:17<1:56:27, 104.29s/it] {'loss': 2.4997, 'grad_norm': 0.17195050418376923, 'learning_rate': 2.0554783129078564e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 482.3, 'epoch': 0.71}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 165/232 [5:13:17<1:56:27, 104.29s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 166/232 [5:15:02<1:54:46, 104.35s/it] {'loss': 2.5106, 'grad_norm': 0.15829245746135712, 'learning_rate': 1.9998360078808547e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 544.61, 'epoch': 0.71}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 166/232 [5:15:02<1:54:46, 104.35s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 167/232 [5:16:47<1:53:19, 104.60s/it] {'loss': 2.6374, 'grad_norm': 0.29079005122184753, 'learning_rate': 1.944768329940045e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 611.46, 'epoch': 0.72}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 167/232 [5:16:47<1:53:19, 104.60s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 168/232 [5:18:33<1:52:00, 105.01s/it] {'loss': 2.3114, 'grad_norm': 0.14395098388195038, 'learning_rate': 1.8902858263019746e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 561.31, 'epoch': 0.72}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 168/232 [5:18:33<1:52:00, 105.01s/it][2025-10-07 17:09:51,859] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 17:09:54,453] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1820712089538574
[2025-10-07 17:09:55,648] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1941492557525635
[2025-10-07 17:09:56,840] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1920030117034912
[2025-10-07 17:09:58,008] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.1677920818328857
[2025-10-07 17:09:58,009] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.08s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.78s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.68s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.25s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:41<02:08, 7.57s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:04, 7.76s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.88s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.03s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.07s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:22<01:37, 8.10s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:30<01:29, 8.11s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.18s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.17s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.17s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:02<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.01s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:43<00:16, 8.18s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.20s/it]
{'eval_loss': 2.4160118103027344, 'eval_runtime': 188.376, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.196, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.72}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 168/232 [5:21:48<1:52:00, 105.01s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [03:00<00:00, 8.20s/it]
[2025-10-07 17:13:06,394] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 17:13:16,663] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-168
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 169/232 [5:23:58<2:59:27, 170.91s/it] {'loss': 2.4697, 'grad_norm': 0.1664450615644455, 'learning_rate': 1.836398932103658e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 541.01, 'epoch': 0.73}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 169/232 [5:23:58<2:59:27, 170.91s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 170/232 [5:25:44<2:36:41, 151.64s/it] {'loss': 2.4647, 'grad_norm': 0.16507600247859955, 'learning_rate': 1.7831179684039041e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 620.34, 'epoch': 0.73}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 170/232 [5:25:44<2:36:41, 151.64s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 171/232 [5:27:25<2:18:38, 136.38s/it] {'loss': 2.5393, 'grad_norm': 0.2299947887659073, 'learning_rate': 1.7304531402065033e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 554.48, 'epoch': 0.74}
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 171/232 [5:27:25<2:18:38, 136.38s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 172/232 [5:29:10<2:06:59, 126.99s/it] {'loss': 2.3934, 'grad_norm': 0.26125118136405945, 'learning_rate': 1.6784145345056519e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 454.37, 'epoch': 0.74}
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 172/232 [5:29:10<2:06:59, 126.99s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 173/232 [5:30:53<1:57:44, 119.74s/it] {'loss': 2.4524, 'grad_norm': 0.25597989559173584, 'learning_rate': 1.627012118353965e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 552.18, 'epoch': 0.74}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 173/232 [5:30:53<1:57:44, 119.74s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 174/232 [5:32:38<1:51:31, 115.37s/it] {'loss': 2.3758, 'grad_norm': 0.15529395639896393, 'learning_rate': 1.5762557369534709e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 571.66, 'epoch': 0.75}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 174/232 [5:32:38<1:51:31, 115.37s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 175/232 [5:34:24<1:46:52, 112.49s/it] {'loss': 2.5284, 'grad_norm': 0.19255177676677704, 'learning_rate': 1.5261551117699358e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 619.51, 'epoch': 0.75}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 175/232 [5:34:24<1:46:52, 112.49s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 176/232 [5:36:09<1:42:49, 110.16s/it] {'loss': 2.4243, 'grad_norm': 0.29165372252464294, 'learning_rate': 1.4767198386708998e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.81, 'epoch': 0.76}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 176/232 [5:36:09<1:42:49, 110.16s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 177/232 [5:37:54<1:39:43, 108.79s/it] {'loss': 2.652, 'grad_norm': 0.17469967901706696, 'learning_rate': 1.427959386087761e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 620.67, 'epoch': 0.76}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 177/232 [5:37:54<1:39:43, 108.79s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 178/232 [5:39:40<1:36:58, 107.75s/it] {'loss': 2.3821, 'grad_norm': 0.16631750762462616, 'learning_rate': 1.3798830932022616e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 559.11, 'epoch': 0.77}
77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 178/232 [5:39:40<1:36:58, 107.75s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 179/232 [5:41:24<1:34:20, 106.80s/it] {'loss': 2.4935, 'grad_norm': 0.1531817764043808, 'learning_rate': 1.3325001681577482e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 572.08, 'epoch': 0.77}
77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 179/232 [5:41:24<1:34:20, 106.80s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 180/232 [5:43:11<1:32:24, 106.63s/it] {'loss': 2.4493, 'grad_norm': 0.1834908127784729, 'learning_rate': 1.2858196862955108e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.93, 'epoch': 0.77}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 180/232 [5:43:11<1:32:24, 106.63s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 181/232 [5:44:55<1:30:09, 106.07s/it] {'loss': 2.3978, 'grad_norm': 0.1634717881679535, 'learning_rate': 1.2398505884165652e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.54, 'epoch': 0.78}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 181/232 [5:44:55<1:30:09, 106.07s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 182/232 [5:46:40<1:28:09, 105.80s/it] {'loss': 2.5803, 'grad_norm': 0.1714055985212326, 'learning_rate': 1.1946016790692094e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 599.66, 'epoch': 0.78}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 182/232 [5:46:40<1:28:09, 105.80s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 183/232 [5:48:25<1:26:03, 105.38s/it] {'loss': 2.3313, 'grad_norm': 0.16218791902065277, 'learning_rate': 1.1500816248626711e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 493.83, 'epoch': 0.79}
79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 183/232 [5:48:25<1:26:03, 105.38s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 184/232 [5:50:10<1:24:16, 105.33s/it] {'loss': 2.3898, 'grad_norm': 0.1729874461889267, 'learning_rate': 1.1062989528071683e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.82, 'epoch': 0.79}
79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 184/232 [5:50:10<1:24:16, 105.33s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 185/232 [5:51:56<1:22:32, 105.37s/it] {'loss': 2.3343, 'grad_norm': 0.1679660528898239, 'learning_rate': 1.0632620486807244e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.54, 'epoch': 0.8}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 185/232 [5:51:56<1:22:32, 105.37s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 186/232 [5:53:40<1:20:35, 105.13s/it] {'loss': 2.309, 'grad_norm': 0.16071046888828278, 'learning_rate': 1.0209791554230209e-06, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 498.13, 'epoch': 0.8}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 186/232 [5:53:40<1:20:35, 105.13s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 187/232 [5:55:25<1:18:52, 105.18s/it] {'loss': 2.4983, 'grad_norm': 0.16588424146175385, 'learning_rate': 9.79458371556607e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 581.82, 'epoch': 0.8}
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 187/232 [5:55:25<1:18:52, 105.18s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 188/232 [5:57:11<1:17:10, 105.25s/it] {'loss': 2.3184, 'grad_norm': 0.23015336692333221, 'learning_rate': 9.387076496357805e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.74, 'epoch': 0.81}
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 188/232 [5:57:11<1:17:10, 105.25s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 189/232 [5:58:55<1:15:16, 105.04s/it] {'loss': 2.379, 'grad_norm': 0.1979931890964508, 'learning_rate': 8.987347947234193e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.2, 'epoch': 0.81}
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 189/232 [5:58:55<1:15:16, 105.04s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 190/232 [6:00:41<1:13:41, 105.28s/it] {'loss': 2.3405, 'grad_norm': 0.15547919273376465, 'learning_rate': 8.595474628960598e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 593.02, 'epoch': 0.82}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 190/232 [6:00:41<1:13:41, 105.28s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 191/232 [6:02:26<1:11:46, 105.04s/it] {'loss': 2.4418, 'grad_norm': 0.1689731627702713, 'learning_rate': 8.211531597775136e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 577.69, 'epoch': 0.82}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 191/232 [6:02:26<1:11:46, 105.04s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 192/232 [6:04:11<1:10:10, 105.27s/it] {'loss': 2.3292, 'grad_norm': 0.19969233870506287, 'learning_rate': 7.835592391013053e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 389.12, 'epoch': 0.83}
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 192/232 [6:04:11<1:10:10, 105.27s/it][2025-10-07 17:55:30,257] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 17:55:32,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0582835674285889
[2025-10-07 17:55:33,637] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0531456470489502
[2025-10-07 17:55:34,689] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0518488883972168
[2025-10-07 17:55:35,735] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0457401275634766
[2025-10-07 17:55:35,736] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.76s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.66s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.23s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.75s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.88s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.06s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:37, 8.08s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:30<01:29, 8.10s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.18s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.17s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.01s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.07s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:43<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.20s/it]
{'eval_loss': 2.4152867794036865, 'eval_runtime': 188.2145, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.83}
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 192/232 [6:07:25<1:10:10, 105.27s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.20s/it]
[2025-10-07 17:58:43,960] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 17:58:54,750] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-192
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 193/232 [6:09:32<1:50:27, 169.92s/it] {'loss': 2.3347, 'grad_norm': 0.16426128149032593, 'learning_rate': 7.467729013021979e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 554.67, 'epoch': 0.83}
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 193/232 [6:09:32<1:50:27, 169.92s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 194/232 [6:11:15<1:34:52, 149.80s/it] {'loss': 2.3597, 'grad_norm': 0.16122335195541382, 'learning_rate': 7.108011921370728e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 606.77, 'epoch': 0.83}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 194/232 [6:11:15<1:34:52, 149.80s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 195/232 [6:13:00<1:24:07, 136.42s/it] {'loss': 2.3711, 'grad_norm': 0.1594630926847458, 'learning_rate': 6.756510013354512e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 544.71, 'epoch': 0.84}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 195/232 [6:13:00<1:24:07, 136.42s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 196/232 [6:14:45<1:16:08, 126.89s/it] {'loss': 2.4303, 'grad_norm': 0.17642569541931152, 'learning_rate': 6.413290612798883e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.32, 'epoch': 0.84}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 196/232 [6:14:45<1:16:08, 126.89s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 197/232 [6:16:30<1:10:11, 120.34s/it] {'loss': 2.2769, 'grad_norm': 0.14454026520252228, 'learning_rate': 6.078419457165036e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 574.95, 'epoch': 0.85}
85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 197/232 [6:16:30<1:10:11, 120.34s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 198/232 [6:18:15<1:05:37, 115.82s/it] {'loss': 2.3751, 'grad_norm': 0.23137526214122772, 'learning_rate': 5.751960684959046e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 556.73, 'epoch': 0.85}
85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 198/232 [6:18:15<1:05:37, 115.82s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 199/232 [6:20:00<1:01:52, 112.50s/it] {'loss': 2.5033, 'grad_norm': 0.1744750589132309, 'learning_rate': 5.433976823447262e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.73, 'epoch': 0.86}
86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 199/232 [6:20:00<1:01:52, 112.50s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 200/232 [6:21:46<58:59, 110.62s/it] {'loss': 2.5263, 'grad_norm': 0.1891147941350937, 'learning_rate': 5.124528776680371e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.83, 'epoch': 0.86}
86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 200/232 [6:21:46<58:59, 110.62s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 201/232 [6:23:31<56:15, 108.89s/it] {'loss': 2.405, 'grad_norm': 0.17183446884155273, 'learning_rate': 4.823675813828271e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 625.1, 'epoch': 0.86}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 201/232 [6:23:31<56:15, 108.89s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 202/232 [6:25:16<53:52, 107.76s/it] {'loss': 2.3735, 'grad_norm': 0.15647412836551666, 'learning_rate': 4.531475557828202e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.38, 'epoch': 0.87}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 202/232 [6:25:16<53:52, 107.76s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 203/232 [6:27:01<51:38, 106.83s/it] {'loss': 2.371, 'grad_norm': 0.1775280237197876, 'learning_rate': 4.2479839743480965e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.13, 'epoch': 0.87}
88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 203/232 [6:27:01<51:38, 106.83s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 204/232 [6:28:46<49:36, 106.31s/it] {'loss': 2.4249, 'grad_norm': 0.1871228665113449, 'learning_rate': 3.9732553610673465e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 576.87, 'epoch': 0.88}
88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 204/232 [6:28:46<49:36, 106.31s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 205/232 [6:30:31<47:40, 105.93s/it] {'loss': 2.4219, 'grad_norm': 0.18398220837116241, 'learning_rate': 3.7073423372770754e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 555.12, 'epoch': 0.88}
88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 205/232 [6:30:31<47:40, 105.93s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 206/232 [6:32:16<45:44, 105.54s/it] {'loss': 2.3863, 'grad_norm': 0.14451023936271667, 'learning_rate': 3.4502958338018754e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.47, 'epoch': 0.89}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 206/232 [6:32:16<45:44, 105.54s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 207/232 [6:34:01<43:55, 105.42s/it] {'loss': 2.4147, 'grad_norm': 0.15192757546901703, 'learning_rate': 3.20216508324494e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 616.78, 'epoch': 0.89}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 207/232 [6:34:01<43:55, 105.42s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 208/232 [6:35:46<42:08, 105.34s/it] {'loss': 2.303, 'grad_norm': 0.1576743721961975, 'learning_rate': 2.9629976105584266e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.21, 'epoch': 0.89}
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 208/232 [6:35:46<42:08, 105.34s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 209/232 [6:37:31<40:18, 105.14s/it] {'loss': 2.49, 'grad_norm': 0.16963180899620056, 'learning_rate': 2.732839223940914e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 594.11, 'epoch': 0.9}
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 209/232 [6:37:31<40:18, 105.14s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 210/232 [6:39:17<38:38, 105.40s/it] {'loss': 2.44, 'grad_norm': 0.1585075557231903, 'learning_rate': 2.5117340060636817e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 591.26, 'epoch': 0.9}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 210/232 [6:39:17<38:38, 105.40s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 211/232 [6:41:01<36:49, 105.19s/it] {'loss': 2.4358, 'grad_norm': 0.16591255366802216, 'learning_rate': 2.2997243056274822e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 626.0, 'epoch': 0.91}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 211/232 [6:41:01<36:49, 105.19s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 212/232 [6:42:46<35:03, 105.16s/it] {'loss': 2.2981, 'grad_norm': 0.1462378203868866, 'learning_rate': 2.096850729251404e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 623.6, 'epoch': 0.91}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 212/232 [6:42:47<35:03, 105.16s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 213/232 [6:44:32<33:17, 105.15s/it] {'loss': 2.3811, 'grad_norm': 0.1623452752828598, 'learning_rate': 1.903152133695385e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 579.91, 'epoch': 0.92}
92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 213/232 [6:44:32<33:17, 105.15s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 214/232 [6:46:17<31:31, 105.09s/it] {'loss': 2.397, 'grad_norm': 0.15678995847702026, 'learning_rate': 1.7186656184179475e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 560.78, 'epoch': 0.92}
92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 214/232 [6:46:17<31:31, 105.09s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 215/232 [6:48:02<29:47, 105.15s/it] {'loss': 2.4259, 'grad_norm': 0.20976859331130981, 'learning_rate': 1.543426518470431e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.46, 'epoch': 0.92}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 215/232 [6:48:02<29:47, 105.15s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 216/232 [6:49:47<28:04, 105.28s/it] {'loss': 2.5872, 'grad_norm': 0.18247883021831512, 'learning_rate': 1.3774683977292426e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 620.68, 'epoch': 0.93}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 216/232 [6:49:47<28:04, 105.28s/it][2025-10-07 18:41:06,193] [INFO] [axolotl.core.trainers.base.evaluate:376] [PID:8314] Running evaluation step...
[2025-10-07 18:41:08,512] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.050823450088501
[2025-10-07 18:41:09,574] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.061586856842041
[2025-10-07 18:41:10,635] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.060603380203247
[2025-10-07 18:41:11,689] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:8314] generate_batches time: 1.0533573627471924
[2025-10-07 18:41:11,690] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:8314] gather_len_batches: [47, 47]
0%| | 0/23 [00:00<?, ?it/s]
9%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2/23 [00:08<01:25, 4.06s/it]
13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3/23 [00:16<01:55, 5.76s/it]
17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4/23 [00:24<02:06, 6.66s/it]
22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5/23 [00:32<02:10, 7.23s/it]
26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 6/23 [00:40<02:08, 7.56s/it]
30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/23 [00:49<02:03, 7.74s/it]
35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/23 [00:57<01:58, 7.87s/it]
39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 9/23 [01:05<01:52, 8.02s/it]
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 10/23 [01:13<01:44, 8.06s/it]
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/23 [01:21<01:37, 8.09s/it]
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 12/23 [01:30<01:29, 8.11s/it]
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 13/23 [01:38<01:21, 8.18s/it]
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 14/23 [01:46<01:13, 8.17s/it]
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 15/23 [01:54<01:05, 8.16s/it]
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 16/23 [02:01<00:55, 7.90s/it]
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17/23 [02:10<00:48, 8.01s/it]
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 18/23 [02:18<00:40, 8.06s/it]
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 19/23 [02:26<00:32, 8.09s/it]
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 20/23 [02:34<00:24, 8.11s/it]
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 21/23 [02:43<00:16, 8.17s/it]
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/23 [02:51<00:08, 8.18s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.20s/it]
{'eval_loss': 2.4150502681732178, 'eval_runtime': 188.221, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.197, 'memory/max_active (GiB)': 5.33, 'memory/max_allocated (GiB)': 5.32, 'memory/device_reserved (GiB)': 7.1, 'epoch': 0.93}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 216/232 [6:53:01<28:04, 105.28s/it]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23/23 [02:59<00:00, 8.20s/it]
[2025-10-07 18:44:19,920] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 18:44:30,660] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-216
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 217/232 [6:55:11<42:40, 170.68s/it] {'loss': 2.4799, 'grad_norm': 0.2342216670513153, 'learning_rate': 1.2208230424672562e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 458.67, 'epoch': 0.93}
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 217/232 [6:55:11<42:40, 170.68s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 218/232 [6:56:57<35:18, 151.30s/it] {'loss': 2.4609, 'grad_norm': 0.20292286574840546, 'learning_rate': 1.0735204552657641e-07, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 18.51, 'tokens_per_second_per_gpu': 623.66, 'epoch': 0.94}
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 218/232 [6:56:57<35:18, 151.30s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 219/232 [6:58:42<29:47, 137.53s/it] {'loss': 2.3216, 'grad_norm': 0.18959718942642212, 'learning_rate': 9.355888492680155e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 621.9, 'epoch': 0.94}
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 219/232 [6:58:42<29:47, 137.53s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 220/232 [7:00:28<25:35, 127.94s/it] {'loss': 2.3656, 'grad_norm': 0.15441013872623444, 'learning_rate': 8.070546427754899e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 614.72, 'epoch': 0.95}
95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 220/232 [7:00:28<25:35, 127.94s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 221/232 [7:02:12<22:10, 120.97s/it] {'loss': 2.415, 'grad_norm': 0.15820012986660004, 'learning_rate': 6.879424541879676e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 579.88, 'epoch': 0.95}
95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 221/232 [7:02:12<22:10, 120.97s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 222/232 [7:03:58<19:22, 116.23s/it] {'loss': 2.4204, 'grad_norm': 0.18182291090488434, 'learning_rate': 5.782750972883111e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 614.62, 'epoch': 0.95}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 222/232 [7:03:58<19:22, 116.23s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 223/232 [7:05:42<16:54, 112.77s/it] {'loss': 2.4713, 'grad_norm': 0.16219539940357208, 'learning_rate': 4.780735768728895e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 600.89, 'epoch': 0.96}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 223/232 [7:05:42<16:54, 112.77s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 224/232 [7:07:28<14:44, 110.54s/it] {'loss': 2.3601, 'grad_norm': 0.1545465886592865, 'learning_rate': 3.873570847285013e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.2, 'epoch': 0.96}
97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 224/232 [7:07:28<14:44, 110.54s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 225/232 [7:09:13<12:42, 108.97s/it] {'loss': 2.4025, 'grad_norm': 0.20205241441726685, 'learning_rate': 3.0614299595654875e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.33, 'epoch': 0.97}
97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 225/232 [7:09:13<12:42, 108.97s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 226/232 [7:10:58<10:45, 107.67s/it] {'loss': 2.5119, 'grad_norm': 0.17662324011325836, 'learning_rate': 2.3444686564511042e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 580.82, 'epoch': 0.97}
97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 226/232 [7:10:58<10:45, 107.67s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 227/232 [7:12:43<08:54, 106.96s/it] {'loss': 2.3202, 'grad_norm': 0.18644018471240997, 'learning_rate': 1.7228242588969714e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 622.32, 'epoch': 0.98}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 227/232 [7:12:43<08:54, 106.96s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 228/232 [7:14:28<07:05, 106.44s/it] {'loss': 2.3215, 'grad_norm': 0.14924494922161102, 'learning_rate': 1.1966158316307208e-08, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 595.23, 'epoch': 0.98}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 228/232 [7:14:28<07:05, 106.44s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 229/232 [7:16:13<05:17, 105.91s/it] {'loss': 2.554, 'grad_norm': 0.18786196410655975, 'learning_rate': 7.65944160348142e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 524.39, 'epoch': 0.98}
99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 229/232 [7:16:13<05:17, 105.91s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 230/232 [7:17:59<03:31, 105.97s/it] {'loss': 2.3815, 'grad_norm': 0.16499285399913788, 'learning_rate': 4.308917324092887e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 451.24, 'epoch': 0.99}
99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 230/232 [7:17:59<03:31, 105.97s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 231/232 [7:19:44<01:45, 105.61s/it] {'loss': 2.3755, 'grad_norm': 0.1830398291349411, 'learning_rate': 1.9152272103972746e-09, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 550.24, 'epoch': 0.99}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 231/232 [7:19:44<01:45, 105.61s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 232/232 [7:21:30<00:00, 105.76s/it] {'loss': 2.3878, 'grad_norm': 0.18990111351013184, 'learning_rate': 4.788297303903732e-10, 'memory/max_active (GiB)': 13.85, 'memory/max_allocated (GiB)': 13.84, 'memory/device_reserved (GiB)': 15.88, 'tokens_per_second_per_gpu': 535.43, 'epoch': 1.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 232/232 [7:21:30<00:00, 105.76s/it][2025-10-07 19:12:48,537] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 19:12:57,756] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv/checkpoint-232
{'train_runtime': 26517.4203, 'train_samples_per_second': 0.07, 'train_steps_per_second': 0.009, 'train_loss': 2.4443190539705344, 'memory/max_active (GiB)': 4.12, 'memory/max_allocated (GiB)': 4.12, 'memory/device_reserved (GiB)': 4.18, 'epoch': 1.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 232/232 [7:21:54<00:00, 105.76s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 232/232 [7:21:54<00:00, 114.29s/it]
[2025-10-07 19:16:04,232] [INFO] [axolotl.train.save_trained_model:225] [PID:8314] Training completed! Saving trained model to ckpts-mmarv.
[2025-10-07 19:16:04,250] [WARNING] [py.warnings._showwarnmsg:110] [PID:8314] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
warnings.warn(
[2025-10-07 19:16:13,614] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
[2025-10-07 19:16:25,556] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
Processing Files (0 / 0) : | | 0.00B / 0.00B
New Data Upload : | | 0.00B / 0.00B 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 29%|β–ˆβ–ˆβ–‰ | 134MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 29%|β–ˆβ–ˆβ–‰ | 134MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 32%|β–ˆβ–ˆβ–ˆβ– | 151MB / 473MB, ???B/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 277MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 294MB / 473MB, 711MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 411MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 428MB / 473MB, 691MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (3 / 3) : 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 473MB / 473MB, 537MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (3 / 3) : 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 473MB / 473MB, 201MB/s
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB
[2025-10-07 19:16:31,415] [INFO] [axolotl.train.save_trained_model:346] [PID:8314] Model successfully saved to ckpts-mmarv
[2025-10-07 19:16:41,179] [INFO] [axolotl.core.trainers.base._save:671] [PID:8314] Saving model checkpoint to ckpts-mmarv
Processing Files (0 / 0) : | | 0.00B / 0.00B
New Data Upload : | | 0.00B / 0.00B 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 31%|β–ˆβ–ˆβ–ˆ | 143MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 31%|β–ˆβ–ˆβ–ˆ | 143MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 34%|β–ˆβ–ˆβ–ˆβ–Ž | 160MB / 473MB, ???B/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 277MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 294MB / 473MB, 670MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 419MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (2 / 3) : 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 437MB / 473MB, 692MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (3 / 3) : 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 473MB / 473MB, 523MB/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB 
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB 
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB 
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB  Processing Files (3 / 3) : 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 473MB / 473MB, 224MB/s
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
...ining/ckpts-mmarv/training_args.bin: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7.95kB / 7.95kB
...pts-mmarv/adapter_model.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456MB / 456MB
...training/ckpts-mmarv/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17.1MB / 17.1MB