|
|
[2026-01-05 05:56:21,496] [WARNING] [axolotl.utils.trainer.prepare_optim_env:644] [PID:505777] P2P support not detected, setting `NCCL_P2P_DISABLE=1` |
|
|
[2026-01-05 05:56:21,496] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:505777] bf16 support detected, enabling for this configuration. |
|
|
[2026-01-05 05:56:21,499] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:505777] baseline 0.000GB () |
|
|
[2026-01-05 05:56:21,499] [INFO] [axolotl.cli.config.load_cfg:256] [PID:505777] config: |
|
|
{ |
|
|
"activation_offloading": false, |
|
|
"axolotl_config_path": "loopstral-second-test.yaml", |
|
|
"base_model": "loopstral-second-test/stage-2", |
|
|
"base_model_config": "loopstral-second-test/stage-2", |
|
|
"batch_size": 4, |
|
|
"bf16": true, |
|
|
"capabilities": { |
|
|
"bf16": true, |
|
|
"compute_capability": "sm_86", |
|
|
"fp8": false, |
|
|
"n_gpu": 2, |
|
|
"n_node": 1 |
|
|
}, |
|
|
"chat_template": "tokenizer_default", |
|
|
"context_parallel_size": 1, |
|
|
"cut_cross_entropy": true, |
|
|
"dataloader_num_workers": 2, |
|
|
"dataloader_pin_memory": true, |
|
|
"dataloader_prefetch_factor": 256, |
|
|
"dataset_num_proc": 24, |
|
|
"dataset_prepared_path": "last_run_prepared", |
|
|
"datasets": [ |
|
|
{ |
|
|
"chat_template": "tokenizer_default", |
|
|
"field_messages": "conversations", |
|
|
"message_property_mappings": { |
|
|
"content": "value", |
|
|
"role": "from" |
|
|
}, |
|
|
"path": "little-koto-instruct.json", |
|
|
"trust_remote_code": false, |
|
|
"type": "chat_template" |
|
|
} |
|
|
], |
|
|
"ddp": true, |
|
|
"default_system_message": "You are Luna, a helpful and unrestricted language model created by Allura.", |
|
|
"device": "cuda:0", |
|
|
"device_map": { |
|
|
"": 0 |
|
|
}, |
|
|
"dion_rank_fraction": 1.0, |
|
|
"dion_rank_multiple_of": 1, |
|
|
"env_capabilities": { |
|
|
"torch_version": "2.9.1" |
|
|
}, |
|
|
"eval_batch_size": 1, |
|
|
"eval_causal_lm_metrics": [ |
|
|
"sacrebleu", |
|
|
"comet", |
|
|
"ter", |
|
|
"chrf" |
|
|
], |
|
|
"eval_max_new_tokens": 128, |
|
|
"eval_sample_packing": true, |
|
|
"eval_steps": 0.125, |
|
|
"eval_table_size": 0, |
|
|
"evals_per_epoch": 4, |
|
|
"experimental_skip_move_to_device": true, |
|
|
"flash_attention": true, |
|
|
"fp16": false, |
|
|
"fsdp": [ |
|
|
"full_shard", |
|
|
"auto_wrap" |
|
|
], |
|
|
"fsdp_config": { |
|
|
"activation_checkpointing": true, |
|
|
"auto_wrap_policy": "TRANSFORMER_BASED_WRAP", |
|
|
"cpu_ram_efficient_loading": true, |
|
|
"offload_params": true, |
|
|
"state_dict_type": "FULL_STATE_DICT", |
|
|
"sync_module_states": true, |
|
|
"transformer_layer_cls_to_wrap": "MistralDecoderLayer", |
|
|
"use_orig_params": true |
|
|
}, |
|
|
"gc_steps": 10, |
|
|
"gradient_accumulation_steps": 2, |
|
|
"gradient_checkpointing": false, |
|
|
"group_by_length": false, |
|
|
"include_tkps": true, |
|
|
"is_mistral_derived_model": true, |
|
|
"learning_rate": 1e-05, |
|
|
"liger_glu_activation": true, |
|
|
"liger_layer_norm": true, |
|
|
"liger_rms_norm": true, |
|
|
"liger_rope": true, |
|
|
"lisa_layers_attribute": "model.layers", |
|
|
"load_best_model_at_end": false, |
|
|
"load_in_4bit": false, |
|
|
"load_in_8bit": false, |
|
|
"local_rank": 0, |
|
|
"logging_steps": 1, |
|
|
"lora_alpha": 16, |
|
|
"lora_dropout": 0.01, |
|
|
"lora_r": 128, |
|
|
"lora_target_linear": true, |
|
|
"lora_target_modules": [ |
|
|
"up_proj", |
|
|
"down_proj", |
|
|
"gate_proj", |
|
|
"q_proj", |
|
|
"v_proj", |
|
|
"k_proj", |
|
|
"o_proj" |
|
|
], |
|
|
"loraplus_lr_embedding": 1e-06, |
|
|
"lr_scheduler": "cosine", |
|
|
"max_grad_norm": 2.0, |
|
|
"mean_resizing_embeddings": false, |
|
|
"micro_batch_size": 1, |
|
|
"model_config_type": "mistral", |
|
|
"num_epochs": 2.0, |
|
|
"optimizer": "adamw_torch_fused", |
|
|
"otel_metrics_host": "localhost", |
|
|
"otel_metrics_port": 8000, |
|
|
"output_dir": "loopstral-second-test/stage-3-healed", |
|
|
"pad_to_sequence_len": true, |
|
|
"peft_use_rslora": true, |
|
|
"plugins": [ |
|
|
"axolotl.integrations.liger.LigerPlugin", |
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" |
|
|
], |
|
|
"pretrain_multipack_attn": true, |
|
|
"profiler_steps_start": 0, |
|
|
"qlora_sharded_model_loading": false, |
|
|
"ray_num_workers": 1, |
|
|
"resources_per_worker": { |
|
|
"GPU": 1 |
|
|
}, |
|
|
"sample_packing": true, |
|
|
"sample_packing_bin_size": 200, |
|
|
"sample_packing_group_size": 100000, |
|
|
"save_only_model": false, |
|
|
"save_safetensors": true, |
|
|
"save_steps": 0.5, |
|
|
"saves_per_epoch": 1, |
|
|
"seed": 420, |
|
|
"sequence_len": 4096, |
|
|
"shuffle_before_merging_datasets": false, |
|
|
"shuffle_merged_datasets": true, |
|
|
"skip_prepare_dataset": false, |
|
|
"streaming_multipack_buffer_size": 10000, |
|
|
"strict": false, |
|
|
"tensor_parallel_size": 1, |
|
|
"tiled_mlp_use_original_mlp": true, |
|
|
"tokenizer_config": "loopstral-second-test/stage-2", |
|
|
"tokenizer_save_jinja_files": true, |
|
|
"torch_dtype": "torch.bfloat16", |
|
|
"train_on_inputs": false, |
|
|
"trl": { |
|
|
"log_completions": false, |
|
|
"mask_truncated_completions": false, |
|
|
"ref_model_mixup_alpha": 0.9, |
|
|
"ref_model_sync_steps": 64, |
|
|
"scale_rewards": true, |
|
|
"sync_ref_model": false, |
|
|
"use_vllm": false, |
|
|
"vllm_server_host": "0.0.0.0", |
|
|
"vllm_server_port": 8000 |
|
|
}, |
|
|
"trust_remote_code": false, |
|
|
"use_otel_metrics": false, |
|
|
"use_ray": false, |
|
|
"use_wandb": true, |
|
|
"val_set_size": 0.02, |
|
|
"vllm": { |
|
|
"device": "auto", |
|
|
"dtype": "auto", |
|
|
"gpu_memory_utilization": 0.9, |
|
|
"host": "0.0.0.0", |
|
|
"port": 8000 |
|
|
}, |
|
|
"wandb_name": "second-stage-3-healed", |
|
|
"wandb_project": "Loopstral-Tests", |
|
|
"warmup_ratio": 0.025, |
|
|
"weight_decay": 0.001, |
|
|
"world_size": 2 |
|
|
} |
|
|
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s> |
|
|
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s> |
|
|
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad> |
|
|
[2026-01-05 05:56:21,851] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk> |
|
|
[2026-01-05 05:56:27,343] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:505777] Loading prepared dataset from disk at last_run_prepared/7bb3932098dd42f3b946c9e64ba32239... |
|
|
[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 18_837 |
|
|
[2026-01-05 05:56:27,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 13_323` |
|
|
[2026-01-05 05:56:27,355] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:27,916] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:28,173] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.257068395614624 |
|
|
[2026-01-05 05:56:28,174] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25144505500793457 |
|
|
[2026-01-05 05:56:28,425] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25107741355895996 |
|
|
[2026-01-05 05:56:28,676] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:28,927] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25074076652526855 |
|
|
[2026-01-05 05:56:29,428] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
[2026-01-05 05:56:29,485] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 1 |
|
|
[2026-01-05 05:56:29,499] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.7664794921875, 0.9197753667831421] |
|
|
[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: None |
|
|
[2026-01-05 05:56:29,500] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 2 |
|
|
[2026-01-05 05:56:29,505] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:505777] total_num_tokens: 922_178 |
|
|
[2026-01-05 05:56:29,514] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:505777] `total_supervised_tokens: 746_491` |
|
|
[2026-01-05 05:56:29,525] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:29,778] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252016544342041 |
|
|
[2026-01-05 05:56:30,030] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:30,282] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.252286434173584 |
|
|
[2026-01-05 05:56:30,283] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:30,534] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25146055221557617 |
|
|
[2026-01-05 05:56:30,535] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:505777] Using single process for pack_parallel, running sequentially. |
|
|
[2026-01-05 05:56:30,786] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 0.25194621086120605 |
|
|
[2026-01-05 05:56:30,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228] |
|
|
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:505777] data_loader_len: 57 |
|
|
[2026-01-05 05:56:30,803] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:505777] sample_packing_eff_est across ranks: [0.9874610304832458, 0.9874610304832458] |
|
|
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:505777] sample_packing_eff_est: 0.99 |
|
|
[2026-01-05 05:56:30,803] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:505777] total_num_steps: 114 |
|
|
[2026-01-05 05:56:30,804] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:505777] Maximum number of steps set at 114 |
|
|
[2026-01-05 05:56:30,828] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:505777] loading tokenizer... loopstral-second-test/stage-2 |
|
|
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:505777] EOS: 2 / </s> |
|
|
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:505777] BOS: 1 / <s> |
|
|
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:505777] PAD: 11 / <pad> |
|
|
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:505777] UNK: 0 / <unk> |
|
|
[2026-01-05 05:56:30,988] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:505777] Loading model |
|
|
[2026-01-05 05:56:30,994] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:505777] Patched Trainer.evaluation_loop with nanmean loss calculation |
|
|
[2026-01-05 05:56:30,995] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:505777] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation |
|
|
[2026-01-05 05:56:30,995] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:505777] Applying multipack dataloader patch for sample packing... |
|
|
[2026-01-05 05:56:31,073] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:505777] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True} |
|
|
[2026-01-05 05:56:31,145] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:505777] Applying Cut Cross Entropy to model type: mistral |
|
|
Loading checkpoint shards: 0
Loading checkpoint shards: 33
Loading checkpoint shards: 67
Loading checkpoint shards: 100
Loading checkpoint shards: 100 |
|
|
[2026-01-05 05:56:36,035] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:505777] Converting modules to torch.bfloat16 |
|
|
[2026-01-05 05:56:36,037] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:505777] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved) |
|
|
[2026-01-05 05:56:39,795] [INFO] [axolotl.train.save_initial_configs:417] [PID:505777] Pre-saving tokenizer to loopstral-second-test/stage-3-healed... |
|
|
[2026-01-05 05:56:39,829] [INFO] [axolotl.train.save_initial_configs:422] [PID:505777] Pre-saving model config to loopstral-second-test/stage-3-healed... |
|
|
[2026-01-05 05:56:39,831] [INFO] [axolotl.train.execute_training:212] [PID:505777] Starting trainer... |
|
|
[2026-01-05 05:56:42,363] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0816106796264648 |
|
|
[2026-01-05 05:56:43,451] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0880126953125 |
|
|
[2026-01-05 05:56:44,524] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0731561183929443 |
|
|
[2026-01-05 05:56:45,606] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.0811669826507568 |
|
|
[2026-01-05 05:56:45,607] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [228, 228] |
|
|
[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralForCausalLM because mixed precision turned on in FSDP. Affects: model.embed_tokens.weight, model.norm.weight, lm_head.weight. |
|
|
warnings.warn( |
|
|
|
|
|
[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1968: UserWarning: Upcasted low precision parameters in MistralDecoderLayer because mixed precision turned on in FSDP. Affects: self_attn.q_proj.weight, self_attn.k_proj.weight, self_attn.v_proj.weight, self_attn.o_proj.weight, mlp.gate_proj.weight, mlp.up_proj.weight, mlp.down_proj.weight, input_layernorm.weight, post_attention_layernorm.weight. |
|
|
warnings.warn( |
|
|
|
|
|
[2026-01-05 05:56:55,362] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/accelerate/accelerator.py:1974: UserWarning: FSDP upcast of low precision parameters may affect the precision of model checkpoints. |
|
|
warnings.warn( |
|
|
|
|
|
[34m[1mwandb[0m: Currently logged in as: [33mcooawoo[0m ([33mcooawoo-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin |
|
|
[34m[1mwandb[0m: [38;5;178mβ’Ώ[0m Waiting for wandb.init()... |
|
|
[Am[2K
[34m[1mwandb[0m: [38;5;178mβ£»[0m setting up run 90pp12rs (0.2s) |
|
|
[Am[2K
[34m[1mwandb[0m: [38;5;178mβ£½[0m setting up run 90pp12rs (0.2s) |
|
|
[Am[2K
[34m[1mwandb[0m: Tracking run with wandb version 0.23.1 |
|
|
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/aibox/training/wandb/run-20260105_055655-90pp12rs[0m |
|
|
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing. |
|
|
[34m[1mwandb[0m: Syncing run [33msecond-stage-3-healed[0m |
|
|
[34m[1mwandb[0m: βοΈ View project at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests[0m |
|
|
[34m[1mwandb[0m: π View run at [34m[4mhttps://wandb.ai/cooawoo-personal/Loopstral-Tests/runs/90pp12rs[0m |
|
|
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use. |
|
|
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. |
|
|
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/ |
|
|
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") |
|
|
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files. |
|
|
[2026-01-05 05:56:58,375] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:505777] The Axolotl config has been saved to the WandB run under files. |
|
|
0 |
|
|
[2026-01-05 05:57:01,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.6251494884490967 |
|
|
[2026-01-05 05:57:03,393] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.645111322402954 |
|
|
[2026-01-05 05:57:04,955] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5617477893829346 |
|
|
[2026-01-05 05:57:06,522] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.5672385692596436 |
|
|
[2026-01-05 05:57:06,523] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 1.1707839965820312, 'eval_runtime': 35.4697, 'eval_samples_per_second': 0.536, 'eval_steps_per_second': 0.282, 'eval_ppl': 3.2245, 'memory/max_active (GiB)': 3.76, 'memory/max_allocated (GiB)': 3.76, 'memory/device_reserved (GiB)': 9.22, 'epoch': 0} |
|
|
0 |
|
|
100 |
|
|
[A
1
{'loss': 1.2216, 'grad_norm': 19.611356735229492, 'learning_rate': 0.0, 'ppl': 3.3926, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 420.22, 'total_tokens': 23701, 'epoch': 0.02} |
|
|
1
2
{'loss': 1.3721, 'grad_norm': 22.5228214263916, 'learning_rate': 5e-06, 'ppl': 3.9436, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 438.27, 'total_tokens': 36593, 'epoch': 0.04} |
|
|
2
3
{'loss': 1.2242, 'grad_norm': 23.2207088470459, 'learning_rate': 1e-05, 'ppl': 3.4014, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 235.72, 'total_tokens': 47910, 'epoch': 0.05} |
|
|
3
4
{'loss': 1.2708, 'grad_norm': 9.459047317504883, 'learning_rate': 9.998033131915266e-06, 'ppl': 3.5637, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.71, 'total_tokens': 62377, 'epoch': 0.07} |
|
|
4
4
{'loss': 1.0452, 'grad_norm': 8.951719284057617, 'learning_rate': 9.992134075089085e-06, 'ppl': 2.844, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 371.1, 'total_tokens': 74989, 'epoch': 0.09} |
|
|
4
5
{'loss': 1.037, 'grad_norm': 5.003825664520264, 'learning_rate': 9.982307470588097e-06, 'ppl': 2.8207, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.42, 'total_tokens': 88184, 'epoch': 0.11} |
|
|
5
6
{'loss': 1.1807, 'grad_norm': 6.691038131713867, 'learning_rate': 9.968561049466214e-06, 'ppl': 3.2567, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 291.57, 'total_tokens': 100065, 'epoch': 0.12} |
|
|
6
7
{'loss': 1.1553, 'grad_norm': 4.891448974609375, 'learning_rate': 9.950905626682229e-06, 'ppl': 3.175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 305.04, 'total_tokens': 111981, 'epoch': 0.14} |
|
|
7
8
{'loss': 0.9665, 'grad_norm': 4.162895202636719, 'learning_rate': 9.92935509259118e-06, 'ppl': 2.6287, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.6, 'total_tokens': 126309, 'epoch': 0.16} |
|
|
8
9
{'loss': 1.1024, 'grad_norm': 4.0764946937561035, 'learning_rate': 9.903926402016153e-06, 'ppl': 3.0114, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 315.69, 'total_tokens': 138777, 'epoch': 0.18} |
|
|
9
10
{'loss': 0.9937, 'grad_norm': 4.487460613250732, 'learning_rate': 9.874639560909118e-06, 'ppl': 2.7012, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 381.39, 'total_tokens': 152449, 'epoch': 0.19} |
|
|
10
11
{'loss': 1.0321, 'grad_norm': 4.153564453125, 'learning_rate': 9.841517610611309e-06, 'ppl': 2.807, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.38, 'total_tokens': 165644, 'epoch': 0.21} |
|
|
11
11
{'loss': 0.8913, 'grad_norm': 5.131930828094482, 'learning_rate': 9.804586609725499e-06, 'ppl': 2.4383, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.4, 'total_tokens': 177744, 'epoch': 0.23} |
|
|
11
12
{'loss': 0.862, 'grad_norm': 4.371148109436035, 'learning_rate': 9.763875613614482e-06, 'ppl': 2.3679, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 316.1, 'total_tokens': 190738, 'epoch': 0.25} |
|
|
12
13
{'loss': 1.1077, 'grad_norm': 4.898997783660889, 'learning_rate': 9.719416651541839e-06, 'ppl': 3.0274, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.3, 'total_tokens': 203458, 'epoch': 0.26} |
|
|
13 |
|
|
[2026-01-05 06:02:21,514] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.7565350532531738 |
|
|
[2026-01-05 06:02:23,355] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8411920070648193 |
|
|
[2026-01-05 06:02:25,205] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8490314483642578 |
|
|
[2026-01-05 06:02:27,076] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8708629608154297 |
|
|
[2026-01-05 06:02:27,078] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.8464773297309875, 'eval_runtime': 10.0269, 'eval_samples_per_second': 1.895, 'eval_steps_per_second': 0.997, 'eval_ppl': 2.3314, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.26} |
|
|
13 |
|
|
100 |
|
|
[A
14
{'loss': 0.9622, 'grad_norm': 3.9995832443237305, 'learning_rate': 9.671244701472999e-06, 'ppl': 2.6174, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 315.28, 'total_tokens': 227749, 'epoch': 0.28} |
|
|
14
15
{'loss': 1.1259, 'grad_norm': 4.939190864562988, 'learning_rate': 9.619397662556434e-06, 'ppl': 3.083, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 279.61, 'total_tokens': 238481, 'epoch': 0.3} |
|
|
15
16
{'loss': 1.038, 'grad_norm': 4.293745040893555, 'learning_rate': 9.563916325306595e-06, 'ppl': 2.8236, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.17, 'total_tokens': 251682, 'epoch': 0.32} |
|
|
16
17
{'loss': 0.9465, 'grad_norm': 4.432325839996338, 'learning_rate': 9.504844339512096e-06, 'ppl': 2.5767, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 338.71, 'total_tokens': 263764, 'epoch': 0.33} |
|
|
17
18
{'loss': 1.0822, 'grad_norm': 4.204977512359619, 'learning_rate': 9.442228179894362e-06, 'ppl': 2.9512, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 401.66, 'total_tokens': 276958, 'epoch': 0.35} |
|
|
18
18
{'loss': 0.9275, 'grad_norm': 4.079134464263916, 'learning_rate': 9.376117109543769e-06, 'ppl': 2.5282, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 361.9, 'total_tokens': 289687, 'epoch': 0.37} |
|
|
18
19
{'loss': 1.0468, 'grad_norm': 19.045801162719727, 'learning_rate': 9.306563141162046e-06, 'ppl': 2.8485, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 404.94, 'total_tokens': 303849, 'epoch': 0.39} |
|
|
19
20
{'loss': 0.9479, 'grad_norm': 4.227816581726074, 'learning_rate': 9.233620996141421e-06, 'ppl': 2.5803, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 399.93, 'total_tokens': 318222, 'epoch': 0.4} |
|
|
20
21
{'loss': 0.8656, 'grad_norm': 4.401808261871338, 'learning_rate': 9.157348061512728e-06, 'ppl': 2.3764, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 424.57, 'total_tokens': 331525, 'epoch': 0.42} |
|
|
21
22
{'loss': 0.9866, 'grad_norm': 4.104758262634277, 'learning_rate': 9.077804344796302e-06, 'ppl': 2.6821, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 342.8, 'total_tokens': 344242, 'epoch': 0.44} |
|
|
22
23
{'loss': 0.8213, 'grad_norm': 3.642549514770508, 'learning_rate': 8.995052426791247e-06, 'ppl': 2.2735, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 425.35, 'total_tokens': 358434, 'epoch': 0.46} |
|
|
23
24
{'loss': 0.7607, 'grad_norm': 3.46269154548645, 'learning_rate': 8.90915741234015e-06, 'ppl': 2.1398, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.27, 'total_tokens': 373583, 'epoch': 0.47} |
|
|
24
25
{'loss': 0.8874, 'grad_norm': 4.47441291809082, 'learning_rate': 8.820186879108038e-06, 'ppl': 2.4288, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.56, 'total_tokens': 388098, 'epoch': 0.49} |
|
|
25
25
{'loss': 0.9208, 'grad_norm': 7.110525131225586, 'learning_rate': 8.728210824415829e-06, 'ppl': 2.5113, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 378.74, 'total_tokens': 401056, 'epoch': 0.51} |
|
|
25
26
{'loss': 0.9311, 'grad_norm': 4.498164176940918, 'learning_rate': 8.633301610170136e-06, 'ppl': 2.5373, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 382.0, 'total_tokens': 414830, 'epoch': 0.53} |
|
|
26 |
|
|
[2026-01-05 06:07:09,813] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8823564052581787 |
|
|
[2026-01-05 06:07:11,720] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.9062902927398682 |
|
|
[2026-01-05 06:07:13,619] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8988819122314453 |
|
|
[2026-01-05 06:07:15,483] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.863325834274292 |
|
|
[2026-01-05 06:07:15,484] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.8130025267601013, 'eval_runtime': 9.0489, 'eval_samples_per_second': 2.1, 'eval_steps_per_second': 1.105, 'eval_ppl': 2.2547, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.53} |
|
|
26 |
|
|
100 |
|
|
[A
27
{'loss': 0.9568, 'grad_norm': 4.3019700050354, 'learning_rate': 8.535533905932739e-06, 'ppl': 2.6034, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.97, 'total_tokens': 438063, 'epoch': 0.54} |
|
|
27
28
{'loss': 1.4274, 'grad_norm': 13.445784568786621, 'learning_rate': 8.43498463017451e-06, 'ppl': 4.1678, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 411.95, 'total_tokens': 449755, 'epoch': 0.56} |
|
|
28
29
{'loss': 0.9124, 'grad_norm': 4.726632595062256, 'learning_rate': 8.331732889760021e-06, 'ppl': 2.4903, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 430.35, 'total_tokens': 464296, 'epoch': 0.58} |
|
|
29
30
{'loss': 0.9753, 'grad_norm': 4.2374067306518555, 'learning_rate': 8.22585991771044e-06, 'ppl': 2.652, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 386.8, 'total_tokens': 476505, 'epoch': 0.6} |
|
|
30
31
{'loss': 0.9245, 'grad_norm': 3.9087975025177, 'learning_rate': 8.117449009293668e-06, 'ppl': 2.5206, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 364.2, 'total_tokens': 490425, 'epoch': 0.61} |
|
|
31
32
{'loss': 0.889, 'grad_norm': 4.065995216369629, 'learning_rate': 8.00658545649203e-06, 'ppl': 2.4327, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.93, 'total_tokens': 504789, 'epoch': 0.63} |
|
|
32
32
{'loss': 0.8679, 'grad_norm': 4.762394428253174, 'learning_rate': 7.89335648089903e-06, 'ppl': 2.3819, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 403.62, 'total_tokens': 518553, 'epoch': 0.65} |
|
|
32
33
{'loss': 1.0176, 'grad_norm': 15.393413543701172, 'learning_rate': 7.777851165098012e-06, 'ppl': 2.7665, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 294.78, 'total_tokens': 531189, 'epoch': 0.67} |
|
|
33
34
{'loss': 0.9678, 'grad_norm': 3.8640310764312744, 'learning_rate': 7.660160382576683e-06, 'ppl': 2.6321, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.2, 'total_tokens': 544812, 'epoch': 0.68} |
|
|
34
35
{'loss': 1.0927, 'grad_norm': 4.1391167640686035, 'learning_rate': 7.540376726232648e-06, 'ppl': 2.9823, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 353.51, 'total_tokens': 558027, 'epoch': 0.7} |
|
|
35
36
{'loss': 0.8597, 'grad_norm': 3.4433345794677734, 'learning_rate': 7.4185944355261996e-06, 'ppl': 2.3625, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 410.17, 'total_tokens': 572138, 'epoch': 0.72} |
|
|
36
37
{'loss': 0.8931, 'grad_norm': 3.9547741413116455, 'learning_rate': 7.294909322337689e-06, 'ppl': 2.4427, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 374.06, 'total_tokens': 585392, 'epoch': 0.74} |
|
|
37
38
{'loss': 0.9103, 'grad_norm': 7.8218255043029785, 'learning_rate': 7.169418695587791e-06, 'ppl': 2.4851, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.66, 'total_tokens': 599321, 'epoch': 0.75} |
|
|
38
39
{'loss': 0.8742, 'grad_norm': 4.100659370422363, 'learning_rate': 7.042221284679982e-06, 'ppl': 2.397, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 277.15, 'total_tokens': 611687, 'epoch': 0.77} |
|
|
39
39
{'loss': 1.0304, 'grad_norm': 4.246405124664307, 'learning_rate': 6.913417161825449e-06, 'ppl': 2.8022, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 383.16, 'total_tokens': 624725, 'epoch': 0.79} |
|
|
39 |
|
|
[2026-01-05 06:11:54,496] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8704819679260254 |
|
|
[2026-01-05 06:11:56,390] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.893763780593872 |
|
|
[2026-01-05 06:11:58,227] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.836214542388916 |
|
|
[2026-01-05 06:12:00,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 1.8623669147491455 |
|
|
[2026-01-05 06:12:00,091] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.8040304780006409, 'eval_runtime': 9.0899, 'eval_samples_per_second': 2.09, 'eval_steps_per_second': 1.1, 'eval_ppl': 2.2345, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 0.79} |
|
|
39 |
|
|
100 |
|
|
[A
40
{'loss': 0.936, 'grad_norm': 4.026333332061768, 'learning_rate': 6.783107663311566e-06, 'ppl': 2.5498, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 425.2, 'total_tokens': 648077, 'epoch': 0.81} |
|
|
40
41
{'loss': 0.893, 'grad_norm': 3.673527717590332, 'learning_rate': 6.651395309775837e-06, 'ppl': 2.4424, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 394.49, 'total_tokens': 662323, 'epoch': 0.82} |
|
|
41
42
{'loss': 1.0356, 'grad_norm': 6.087688446044922, 'learning_rate': 6.518383725548074e-06, 'ppl': 2.8168, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.91, 'total_tokens': 674394, 'epoch': 0.84} |
|
|
42
43
{'loss': 0.8771, 'grad_norm': 3.8041579723358154, 'learning_rate': 6.384177557124247e-06, 'ppl': 2.4039, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 366.56, 'total_tokens': 687865, 'epoch': 0.86} |
|
|
43
44
{'loss': 0.991, 'grad_norm': 3.9628713130950928, 'learning_rate': 6.248882390836135e-06, 'ppl': 2.6939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 383.67, 'total_tokens': 700828, 'epoch': 0.88} |
|
|
44
45
{'loss': 0.8278, 'grad_norm': 3.998246669769287, 'learning_rate': 6.112604669781572e-06, 'ppl': 2.2883, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 396.65, 'total_tokens': 713732, 'epoch': 0.89} |
|
|
45
46
{'loss': 0.8638, 'grad_norm': 4.033308506011963, 'learning_rate': 5.975451610080643e-06, 'ppl': 2.3722, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 336.52, 'total_tokens': 726692, 'epoch': 0.91} |
|
|
46
46
{'loss': 1.3544, 'grad_norm': 17.30064582824707, 'learning_rate': 5.837531116523683e-06, 'ppl': 3.8744, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 403.9, 'total_tokens': 739602, 'epoch': 0.93} |
|
|
46
47
{'loss': 0.9146, 'grad_norm': 3.651843309402466, 'learning_rate': 5.698951697677498e-06, 'ppl': 2.4958, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 410.43, 'total_tokens': 753378, 'epoch': 0.95} |
|
|
47
48
{'loss': 1.0104, 'grad_norm': 4.368696212768555, 'learning_rate': 5.559822380516539e-06, 'ppl': 2.7467, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 382.87, 'total_tokens': 767289, 'epoch': 0.96} |
|
|
48
49
{'loss': 1.0087, 'grad_norm': 5.310736656188965, 'learning_rate': 5.420252624646238e-06, 'ppl': 2.742, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 311.44, 'total_tokens': 777836, 'epoch': 0.98} |
|
|
49
50
{'loss': 0.8086, 'grad_norm': 4.207986831665039, 'learning_rate': 5.2803522361859596e-06, 'ppl': 2.2448, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 289.66, 'total_tokens': 788519, 'epoch': 1.0} |
|
|
50 |
|
|
warnings.warn( |
|
|
|
|
|
[2026-01-05 06:16:05,005] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-57 |
|
|
51
{'loss': 0.6146, 'grad_norm': 3.4813573360443115, 'learning_rate': 5.140231281379345e-06, 'ppl': 1.8489, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 261.61, 'total_tokens': 801358, 'epoch': 1.02} |
|
|
51
52
{'loss': 0.6649, 'grad_norm': 3.9285855293273926, 'learning_rate': 5e-06, 'ppl': 1.9443, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 407.5, 'total_tokens': 814250, 'epoch': 1.04} |
|
|
52
53
{'loss': 0.6454, 'grad_norm': 3.718013286590576, 'learning_rate': 4.859768718620656e-06, 'ppl': 1.9067, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 227.37, 'total_tokens': 825567, 'epoch': 1.05} |
|
|
53 |
|
|
[2026-01-05 06:19:24,401] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.28102445602417 |
|
|
[2026-01-05 06:19:26,656] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2544238567352295 |
|
|
[2026-01-05 06:19:28,921] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2653965950012207 |
|
|
[2026-01-05 06:19:31,178] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2563204765319824 |
|
|
[2026-01-05 06:19:31,179] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.7972212433815002, 'eval_runtime': 9.5865, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 1.043, 'eval_ppl': 2.2194, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.05} |
|
|
53 |
|
|
100 |
|
|
[A
54
{'loss': 0.6167, 'grad_norm': 5.092803001403809, 'learning_rate': 4.719647763814041e-06, 'ppl': 1.8528, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 398.69, 'total_tokens': 850896, 'epoch': 1.07} |
|
|
54
54
{'loss': 0.4792, 'grad_norm': 5.132570743560791, 'learning_rate': 4.579747375353763e-06, 'ppl': 1.6148, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 378.34, 'total_tokens': 863508, 'epoch': 1.09} |
|
|
54
55
{'loss': 0.4102, 'grad_norm': 4.274756908416748, 'learning_rate': 4.4401776194834615e-06, 'ppl': 1.5071, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 362.24, 'total_tokens': 876703, 'epoch': 1.11} |
|
|
55
56
{'loss': 0.6118, 'grad_norm': 5.282613754272461, 'learning_rate': 4.3010483023225045e-06, 'ppl': 1.8437, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 293.01, 'total_tokens': 888584, 'epoch': 1.12} |
|
|
56
57
{'loss': 0.5366, 'grad_norm': 4.78963565826416, 'learning_rate': 4.162468883476319e-06, 'ppl': 1.7102, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 302.8, 'total_tokens': 900500, 'epoch': 1.14} |
|
|
57
58
{'loss': 0.4277, 'grad_norm': 4.404544353485107, 'learning_rate': 4.02454838991936e-06, 'ppl': 1.5337, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 387.06, 'total_tokens': 914828, 'epoch': 1.16} |
|
|
58
59
{'loss': 0.4992, 'grad_norm': 4.581356525421143, 'learning_rate': 3.887395330218429e-06, 'ppl': 1.6474, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 321.0, 'total_tokens': 927296, 'epoch': 1.18} |
|
|
59
60
{'loss': 0.4236, 'grad_norm': 3.748277425765991, 'learning_rate': 3.751117609163865e-06, 'ppl': 1.5275, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.86, 'total_tokens': 940968, 'epoch': 1.19} |
|
|
60
61
{'loss': 0.4085, 'grad_norm': 3.996558427810669, 'learning_rate': 3.6158224428757538e-06, 'ppl': 1.5046, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 319.06, 'total_tokens': 954163, 'epoch': 1.21} |
|
|
61
61
{'loss': 0.367, 'grad_norm': 3.7183310985565186, 'learning_rate': 3.4816162744519266e-06, 'ppl': 1.4434, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 392.87, 'total_tokens': 966263, 'epoch': 1.23} |
|
|
61
62
{'loss': 0.3049, 'grad_norm': 3.2405660152435303, 'learning_rate': 3.3486046902241663e-06, 'ppl': 1.3565, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 320.24, 'total_tokens': 979257, 'epoch': 1.25} |
|
|
62
63
{'loss': 0.4782, 'grad_norm': 3.7915146350860596, 'learning_rate': 3.216892336688435e-06, 'ppl': 1.6132, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 352.7, 'total_tokens': 991977, 'epoch': 1.26} |
|
|
63
64
{'loss': 0.4014, 'grad_norm': 3.5778920650482178, 'learning_rate': 3.0865828381745515e-06, 'ppl': 1.4939, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 333.21, 'total_tokens': 1005406, 'epoch': 1.28} |
|
|
64
65
{'loss': 0.5709, 'grad_norm': 5.925182819366455, 'learning_rate': 2.95777871532002e-06, 'ppl': 1.7699, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 291.94, 'total_tokens': 1016138, 'epoch': 1.3} |
|
|
65
66
{'loss': 0.4398, 'grad_norm': 4.066057205200195, 'learning_rate': 2.83058130441221e-06, 'ppl': 1.5524, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 397.09, 'total_tokens': 1029339, 'epoch': 1.32} |
|
|
66 |
|
|
[2026-01-05 06:24:09,576] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2422144412994385 |
|
|
[2026-01-05 06:24:11,841] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2649495601654053 |
|
|
[2026-01-05 06:24:14,135] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2932779788970947 |
|
|
[2026-01-05 06:24:16,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.286076068878174 |
|
|
[2026-01-05 06:24:16,423] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.8332963585853577, 'eval_runtime': 9.262, 'eval_samples_per_second': 2.051, 'eval_steps_per_second': 1.08, 'eval_ppl': 2.3009, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.32} |
|
|
66 |
|
|
100 |
|
|
[A
67
{'loss': 0.4122, 'grad_norm': 3.938491106033325, 'learning_rate': 2.705090677662311e-06, 'ppl': 1.5101, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 341.7, 'total_tokens': 1052283, 'epoch': 1.33} |
|
|
67
68
{'loss': 0.4916, 'grad_norm': 4.054359436035156, 'learning_rate': 2.5814055644738013e-06, 'ppl': 1.6349, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 417.0, 'total_tokens': 1065477, 'epoch': 1.35} |
|
|
68
68
{'loss': 0.3917, 'grad_norm': 4.133195877075195, 'learning_rate': 2.4596232737673544e-06, 'ppl': 1.4795, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 373.89, 'total_tokens': 1078206, 'epoch': 1.37} |
|
|
68
69
{'loss': 0.8396, 'grad_norm': 6.9484052658081055, 'learning_rate': 2.339839617423318e-06, 'ppl': 2.3154, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 398.68, 'total_tokens': 1092368, 'epoch': 1.39} |
|
|
69
70
{'loss': 0.5273, 'grad_norm': 5.065030574798584, 'learning_rate': 2.2221488349019903e-06, 'ppl': 1.6944, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 409.1, 'total_tokens': 1106741, 'epoch': 1.4} |
|
|
70
71
{'loss': 0.3963, 'grad_norm': 4.900491714477539, 'learning_rate': 2.1066435191009717e-06, 'ppl': 1.4863, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 430.18, 'total_tokens': 1120044, 'epoch': 1.42} |
|
|
71
72
{'loss': 0.4278, 'grad_norm': 3.917367458343506, 'learning_rate': 1.9934145435079705e-06, 'ppl': 1.5339, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 351.67, 'total_tokens': 1132761, 'epoch': 1.44} |
|
|
72
73
{'loss': 0.3575, 'grad_norm': 3.7160727977752686, 'learning_rate': 1.8825509907063328e-06, 'ppl': 1.4298, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 424.13, 'total_tokens': 1146953, 'epoch': 1.46} |
|
|
73
74
{'loss': 0.2989, 'grad_norm': 3.235100507736206, 'learning_rate': 1.7741400822895633e-06, 'ppl': 1.3484, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 435.06, 'total_tokens': 1162102, 'epoch': 1.47} |
|
|
74
75
{'loss': 0.4173, 'grad_norm': 3.7896361351013184, 'learning_rate': 1.6682671102399806e-06, 'ppl': 1.5179, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 414.25, 'total_tokens': 1176617, 'epoch': 1.49} |
|
|
75
75
{'loss': 0.5225, 'grad_norm': 3.7193076610565186, 'learning_rate': 1.5650153698254916e-06, 'ppl': 1.6862, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 386.25, 'total_tokens': 1189575, 'epoch': 1.51} |
|
|
75
76
{'loss': 0.4523, 'grad_norm': 3.9064459800720215, 'learning_rate': 1.4644660940672628e-06, 'ppl': 1.5719, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 376.18, 'total_tokens': 1203349, 'epoch': 1.53} |
|
|
76
77
{'loss': 0.4023, 'grad_norm': 3.633103847503662, 'learning_rate': 1.3666983898298659e-06, 'ppl': 1.4953, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.67, 'total_tokens': 1215720, 'epoch': 1.54} |
|
|
77
78
{'loss': 1.3284, 'grad_norm': 9.749296188354492, 'learning_rate': 1.2717891755841722e-06, 'ppl': 3.775, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 409.51, 'total_tokens': 1227412, 'epoch': 1.56} |
|
|
78
79
{'loss': 0.4467, 'grad_norm': 3.298785448074341, 'learning_rate': 1.1798131208919628e-06, 'ppl': 1.5631, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 417.1, 'total_tokens': 1241953, 'epoch': 1.58} |
|
|
79 |
|
|
[2026-01-05 06:28:54,407] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0476205348968506 |
|
|
[2026-01-05 06:28:56,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0721523761749268 |
|
|
[2026-01-05 06:28:58,500] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.020061492919922 |
|
|
[2026-01-05 06:29:00,540] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.0398201942443848 |
|
|
[2026-01-05 06:29:00,623] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.813373863697052, 'eval_runtime': 9.1778, 'eval_samples_per_second': 2.07, 'eval_steps_per_second': 1.09, 'eval_ppl': 2.2555, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.58} |
|
|
79 |
|
|
100 |
|
|
[A
80
{'loss': 0.4378, 'grad_norm': 3.9499611854553223, 'learning_rate': 1.0908425876598512e-06, 'ppl': 1.5493, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 388.15, 'total_tokens': 1265024, 'epoch': 1.6} |
|
|
80
81
{'loss': 0.4448, 'grad_norm': 4.025573253631592, 'learning_rate': 1.004947573208756e-06, 'ppl': 1.5602, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.45, 'total_tokens': 1278944, 'epoch': 1.61} |
|
|
81
82
{'loss': 0.4514, 'grad_norm': 4.208862781524658, 'learning_rate': 9.221956552036992e-07, 'ppl': 1.5705, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 415.53, 'total_tokens': 1293308, 'epoch': 1.63} |
|
|
82
82
{'loss': 0.4805, 'grad_norm': 5.097556114196777, 'learning_rate': 8.426519384872733e-07, 'ppl': 1.6169, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 402.99, 'total_tokens': 1307072, 'epoch': 1.65} |
|
|
82
83
{'loss': 0.7964, 'grad_norm': 5.547924995422363, 'learning_rate': 7.663790038585794e-07, 'ppl': 2.2175, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 300.56, 'total_tokens': 1319708, 'epoch': 1.67} |
|
|
83
84
{'loss': 0.4699, 'grad_norm': 3.6507017612457275, 'learning_rate': 6.934368588379553e-07, 'ppl': 1.5998, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 388.97, 'total_tokens': 1333331, 'epoch': 1.68} |
|
|
84
85
{'loss': 0.6074, 'grad_norm': 4.161734104156494, 'learning_rate': 6.238828904562316e-07, 'ppl': 1.8357, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 370.47, 'total_tokens': 1346546, 'epoch': 1.7} |
|
|
85
86
{'loss': 0.4091, 'grad_norm': 3.1512742042541504, 'learning_rate': 5.577718201056392e-07, 'ppl': 1.5055, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 400.58, 'total_tokens': 1360657, 'epoch': 1.72} |
|
|
86
87
{'loss': 0.4531, 'grad_norm': 3.652284860610962, 'learning_rate': 4.951556604879049e-07, 'ppl': 1.5732, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 378.33, 'total_tokens': 1373911, 'epoch': 1.74} |
|
|
87
88
{'loss': 0.6774, 'grad_norm': 13.29174518585205, 'learning_rate': 4.3608367469340553e-07, 'ppl': 1.9688, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 368.21, 'total_tokens': 1387840, 'epoch': 1.75} |
|
|
88
89
{'loss': 0.4309, 'grad_norm': 3.5832619667053223, 'learning_rate': 3.8060233744356634e-07, 'ppl': 1.5386, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 268.73, 'total_tokens': 1400206, 'epoch': 1.77} |
|
|
89
89
{'loss': 0.5536, 'grad_norm': 3.793043375015259, 'learning_rate': 3.287552985270015e-07, 'ppl': 1.7395, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.95, 'total_tokens': 1413244, 'epoch': 1.79} |
|
|
89
90
{'loss': 0.5032, 'grad_norm': 3.7566542625427246, 'learning_rate': 2.8058334845816214e-07, 'ppl': 1.654, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 425.22, 'total_tokens': 1425734, 'epoch': 1.81} |
|
|
90
91
{'loss': 0.4549, 'grad_norm': 3.206256866455078, 'learning_rate': 2.3612438638551837e-07, 'ppl': 1.576, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 388.45, 'total_tokens': 1439980, 'epoch': 1.82} |
|
|
91
92
{'loss': 0.6494, 'grad_norm': 4.32829475402832, 'learning_rate': 1.9541339027450256e-07, 'ppl': 1.9144, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 340.64, 'total_tokens': 1452051, 'epoch': 1.84} |
|
|
92 |
|
|
[2026-01-05 06:33:40,708] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1876299381256104 |
|
|
[2026-01-05 06:33:42,931] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2227494716644287 |
|
|
[2026-01-05 06:33:45,158] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.2258124351501465 |
|
|
[2026-01-05 06:33:47,352] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:505777] generate_batches time: 2.1935596466064453 |
|
|
[2026-01-05 06:33:47,353] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:505777] gather_len_batches: [5, 5] |
|
|
|
|
|
0 |
|
|
100
|
|
|
[A{'eval_loss': 0.8144508004188538, 'eval_runtime': 9.188, 'eval_samples_per_second': 2.068, 'eval_steps_per_second': 1.088, 'eval_ppl': 2.2579, 'memory/max_active (GiB)': 3.77, 'memory/max_allocated (GiB)': 3.77, 'memory/device_reserved (GiB)': 4.98, 'epoch': 1.84} |
|
|
92 |
|
|
100 |
|
|
[A
93
{'loss': 0.4638, 'grad_norm': 3.6060004234313965, 'learning_rate': 1.5848238938869332e-07, 'ppl': 1.5901, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 368.31, 'total_tokens': 1476384, 'epoch': 1.86} |
|
|
93
94
{'loss': 0.5655, 'grad_norm': 3.643385648727417, 'learning_rate': 1.253604390908819e-07, 'ppl': 1.7603, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 392.59, 'total_tokens': 1489347, 'epoch': 1.88} |
|
|
94
95
{'loss': 0.4432, 'grad_norm': 3.812006711959839, 'learning_rate': 9.607359798384785e-08, 'ppl': 1.5577, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 395.61, 'total_tokens': 1502251, 'epoch': 1.89} |
|
|
95
96
{'loss': 0.4601, 'grad_norm': 3.268007516860962, 'learning_rate': 7.064490740882057e-08, 'ppl': 1.5842, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 331.97, 'total_tokens': 1515211, 'epoch': 1.91} |
|
|
96
96
{'loss': 1.1937, 'grad_norm': 17.401044845581055, 'learning_rate': 4.909437331777178e-08, 'ppl': 3.2993, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 400.17, 'total_tokens': 1528121, 'epoch': 1.93} |
|
|
96
97
{'loss': 0.5327, 'grad_norm': 3.476621627807617, 'learning_rate': 3.143895053378698e-08, 'ppl': 1.7035, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.91, 'tokens_per_second_per_gpu': 412.86, 'total_tokens': 1541897, 'epoch': 1.95} |
|
|
97
98
{'loss': 0.6603, 'grad_norm': 5.652373313903809, 'learning_rate': 1.769252941190458e-08, 'ppl': 1.9354, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 396.43, 'total_tokens': 1555808, 'epoch': 1.96} |
|
|
98
99
{'loss': 0.6319, 'grad_norm': 4.798173427581787, 'learning_rate': 7.865924910916977e-09, 'ppl': 1.8812, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.5, 'tokens_per_second_per_gpu': 309.08, 'total_tokens': 1566355, 'epoch': 1.98} |
|
|
99
100
{'loss': 0.4319, 'grad_norm': 3.836909055709839, 'learning_rate': 1.9668680847356735e-09, 'ppl': 1.5402, 'memory/max_active (GiB)': 12.39, 'memory/max_allocated (GiB)': 12.0, 'memory/device_reserved (GiB)': 22.9, 'tokens_per_second_per_gpu': 292.8, 'total_tokens': 1577038, 'epoch': 2.0} |
|
|
100 |
|
|
warnings.warn( |
|
|
|
|
|
[2026-01-05 06:36:57,988] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed/checkpoint-114 |
|
|
{'train_runtime': 2526.3299, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.045, 'train_loss': 0.76008216357022, 'memory/max_active (GiB)': 9.02, 'memory/max_allocated (GiB)': 9.02, 'memory/device_reserved (GiB)': 9.82, 'epoch': 2.0} |
|
|
100
100 |
|
|
[2026-01-05 06:39:01,711] [INFO] [axolotl.train.save_trained_model:233] [PID:505777] Training completed! Saving trained model to loopstral-second-test/stage-3-healed. |
|
|
[2026-01-05 06:39:01,713] [WARNING] [py.warnings._showwarnmsg:110] [PID:505777] /home/aibox/axo/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html . |
|
|
warnings.warn( |
|
|
|
|
|
[2026-01-05 06:39:23,082] [INFO] [axolotl.core.trainers.base._save:692] [PID:505777] Saving model checkpoint to loopstral-second-test/stage-3-healed |
|
|
[2026-01-05 06:39:47,279] [INFO] [axolotl.train.save_trained_model:351] [PID:505777] Model successfully saved to loopstral-second-test/stage-3-healed |
|
|
[0m |