| [2026-01-04 00:10:03,594] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:15692] bf16 support detected, enabling for this configuration. | |
| [2026-01-04 00:10:03,667] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:15692] baseline 0.000GB () | |
| [2026-01-04 00:10:03,668] [INFO] [axolotl.cli.config.load_cfg:259] [PID:15692] config: | |
| { | |
| "activation_offloading": true, | |
| "adapter": "lora", | |
| "axolotl_config_path": "train.yml", | |
| "base_model": "shb777/Llama-3.3-8B-Instruct-128K", | |
| "base_model_config": "shb777/Llama-3.3-8B-Instruct-128K", | |
| "batch_size": 4, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_86", | |
| "fp8": false, | |
| "n_gpu": 1, | |
| "n_node": 1 | |
| }, | |
| "chat_template": "llama3", | |
| "context_parallel_size": 1, | |
| "cut_cross_entropy": true, | |
| "dataloader_num_workers": 1, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_num_proc": 96, | |
| "dataset_prepared_path": "dataset_prepareds", | |
| "datasets": [ | |
| { | |
| "chat_template": "tokenizer_default", | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "WokeAI/polititune-tankie-warmup", | |
| "split": "train", | |
| "trust_remote_code": false, | |
| "type": "chat_template" | |
| } | |
| ], | |
| "ddp": false, | |
| "device": "cuda:0", | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.8.0" | |
| }, | |
| "eval_batch_size": 2, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_sample_packing": false, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "flash_attention": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 2, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": true | |
| }, | |
| "group_by_length": false, | |
| "include_tkps": true, | |
| "is_llama_derived_model": true, | |
| "learning_rate": 1e-05, | |
| "liger_fused_linear_cross_entropy": false, | |
| "liger_glu_activation": true, | |
| "liger_layer_norm": true, | |
| "liger_rms_norm": true, | |
| "liger_rope": true, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "logging_steps": 1, | |
| "lora_alpha": 16, | |
| "lora_dropout": 0.35, | |
| "lora_r": 64, | |
| "lora_target_linear": true, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "rex", | |
| "max_grad_norm": 0.1, | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 2, | |
| "model_config_type": "llama", | |
| "num_epochs": 2.0, | |
| "optimizer": "adamw_torch_8bit", | |
| "otel_metrics_host": "localhost", | |
| "otel_metrics_port": 8000, | |
| "output_dir": "./output", | |
| "pad_to_sequence_len": true, | |
| "peft_use_rslora": true, | |
| "plugins": [ | |
| "axolotl.integrations.liger.LigerPlugin", | |
| "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" | |
| ], | |
| "pretrain_multipack_attn": true, | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "remove_unused_columns": false, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing": true, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": false, | |
| "save_safetensors": true, | |
| "sequence_len": 4096, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "special_tokens": { | |
| "pad_token": "<|reserved_special_token_2|>" | |
| }, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tf32": false, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "shb777/Llama-3.3-8B-Instruct-128K", | |
| "tokenizer_save_jinja_files": true, | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "use_otel_metrics": false, | |
| "use_ray": false, | |
| "use_wandb": true, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "wandb_project": "newyear", | |
| "weight_decay": 0.0, | |
| "world_size": 1 | |
| } | |
| [2026-01-04 00:10:03,672] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:15692] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets. | |
| [2026-01-04 00:10:04,197] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <|eot_id|> | |
| [2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <|begin_of_text|> | |
| [2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <|reserved_special_token_2|> | |
| [2026-01-04 00:10:04,198] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None | |
| [2026-01-04 00:10:04,199] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:15692] Loading prepared dataset from disk at dataset_prepareds/a420619428aa6c5576289a496238883a... | |
| [2026-01-04 00:10:04,213] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:15692] total_num_tokens: 684_427 | |
| [2026-01-04 00:10:04,225] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:15692] `total_supervised_tokens: 498_319` | |
| [2026-01-04 00:10:04,242] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially. | |
| [2026-01-04 00:10:05,245] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially. | |
| [2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.21780610084533691 | |
| [2026-01-04 00:10:05,463] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially. | |
| [2026-01-04 00:10:05,685] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.22262859344482422 | |
| [2026-01-04 00:10:05,686] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially. | |
| [2026-01-04 00:10:05,930] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.2447659969329834 | |
| [2026-01-04 00:10:05,931] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:15692] Using single process for pack_parallel, running sequentially. | |
| [2026-01-04 00:10:06,188] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.25751280784606934 | |
| [2026-01-04 00:10:06,211] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84] | |
| [2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:494] [PID:15692] data_loader_len: 42 | |
| [2026-01-04 00:10:06,212] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:510] [PID:15692] sample_packing_eff_est across ranks: [0.9946216401599702] | |
| [2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:15692] sample_packing_eff_est: 1.0 | |
| [2026-01-04 00:10:06,212] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:15692] total_num_steps: 84 | |
| [2026-01-04 00:10:06,212] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:15692] Maximum number of steps set at 84 | |
| [2026-01-04 00:10:06,240] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:15692] loading tokenizer... shb777/Llama-3.3-8B-Instruct-128K | |
| [2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:15692] EOS: 128009 / <|eot_id|> | |
| [2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:15692] BOS: 128000 / <|begin_of_text|> | |
| [2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:15692] PAD: 128004 / <|reserved_special_token_2|> | |
| [2026-01-04 00:10:06,719] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:15692] UNK: None / None | |
| [2026-01-04 00:10:06,719] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:15692] Loading model | |
| [2026-01-04 00:10:06,766] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:15692] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2026-01-04 00:10:06,767] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:15692] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2026-01-04 00:10:06,768] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:344] [PID:15692] Applying multipack dataloader patch for sample packing... | |
| [2026-01-04 00:10:06,873] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:15692] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': False, 'rms_norm': True, 'swiglu': True} | |
| [2026-01-04 00:10:07,074] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:15692] Applying Cut Cross Entropy to model type: llama | |
| Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4/4 [00:00<00:00, 133.41it/s] | |
| [2026-01-04 00:10:09,611] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:15692] Converting modules to torch.bfloat16 | |
| [2026-01-04 00:10:10,928] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] Memory usage after model load 0.000GB () | |
| [2026-01-04 00:10:10,929] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:15692] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj'] | |
| trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465 | |
| [2026-01-04 00:10:12,458] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:15692] after adapters 0.000GB () | |
| [2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:413] [PID:15692] Pre-saving adapter config to ./output... | |
| [2026-01-04 00:10:20,773] [INFO] [axolotl.train.save_initial_configs:417] [PID:15692] Pre-saving tokenizer to ./output... | |
| [2026-01-04 00:10:20,929] [INFO] [axolotl.train.save_initial_configs:422] [PID:15692] Pre-saving model config to ./output... | |
| [2026-01-04 00:10:20,931] [INFO] [axolotl.train.execute_training:212] [PID:15692] Starting trainer... | |
| [2026-01-04 00:10:22,337] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.49930906295776367 | |
| [2026-01-04 00:10:22,811] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.47399091720581055 | |
| [2026-01-04 00:10:23,297] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.4854564666748047 | |
| [2026-01-04 00:10:23,802] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:15692] generate_batches time: 0.5046358108520508 | |
| [2026-01-04 00:10:23,802] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:15692] gather_len_batches: [84] | |
| [34m[1mwandb[0m: (1) Create a W&B account | |
| [34m[1mwandb[0m: (2) Use an existing W&B account | |
| [34m[1mwandb[0m: (3) Don't visualize my results | |
| [34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account' | |
| [34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server) | |
| [34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models | |
| [34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one. | |
| [34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc | |
| [34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin | |
| [34m[1mwandb[0m: [38;5;178mβ’Ώ[0m Waiting for wandb.init()... | |
| [Am[2K [34m[1mwandb[0m: [38;5;178mβ£»[0m Waiting for wandb.init()... | |
| [Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.23.1 | |
| [34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20260104_001243-myor4kbd[0m | |
| [34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing. | |
| [34m[1mwandb[0m: Syncing run [33mdistinctive-firebrand-5[0m | |
| [34m[1mwandb[0m: βοΈ View project at [34m[4mhttps://wandb.ai/fizzzz/newyear[0m | |
| [34m[1mwandb[0m: π View run at [34m[4mhttps://wandb.ai/fizzzz/newyear/runs/myor4kbd[0m | |
| [34m[1mwandb[0m: Detected [huggingface_hub.inference] in use. | |
| [34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. | |
| [34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/ | |
| [34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") | |
| [34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files. | |
| [2026-01-04 00:12:45,380] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:15692] The Axolotl config has been saved to the WandB run under files. | |
| 0%| | 0/84 [00:00<?, ?it/s] 1%|ββ | 1/84 [00:36<50:25, 36.45s/it] {'loss': 3.0634, 'grad_norm': 8.193868637084961, 'learning_rate': 4.999999873689376e-06, 'ppl': 21.40019, 'memory/max_active (GiB)': 20.05, 'memory/max_allocated (GiB)': 20.05, 'memory/device_reserved (GiB)': 20.62, 'tokens/train_per_sec_per_gpu': 155.38671875, 'epoch': 0.02, 'tokens/total': 16384.0, 'tokens/trainable': 9972.0} | |
| 1%|ββ | 1/84 [00:36<50:25, 36.45s/it] 2%|ββββ | 2/84 [00:53<33:55, 24.83s/it] {'loss': 3.2197, 'grad_norm': 8.957365036010742, 'learning_rate': 9.999999747378752e-06, 'ppl': 25.02061, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.190185546875, 'epoch': 0.05, 'tokens/total': 32768.0, 'tokens/trainable': 21140.0} | |
| 2%|ββββ | 2/84 [00:53<33:55, 24.83s/it] 4%|βββββββ | 3/84 [01:08<27:24, 20.31s/it] {'loss': 3.0878, 'grad_norm': 7.201009273529053, 'learning_rate': 9.987669727706816e-06, 'ppl': 21.92878, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 446.0367736816406, 'epoch': 0.07, 'tokens/total': 49152.0, 'tokens/trainable': 33455.0} | |
| 4%|βββββββ | 3/84 [01:08<27:24, 20.31s/it] 5%|βββββββββ | 4/84 [01:22<24:13, 18.17s/it] {'loss': 3.0947, 'grad_norm': 6.2549729347229, 'learning_rate': 9.97506231215084e-06, 'ppl': 22.08061, 'memory/max_active (GiB)': 20.37, 'memory/max_allocated (GiB)': 20.37, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 355.3751220703125, 'epoch': 0.1, 'tokens/total': 65536.0, 'tokens/trainable': 44394.0} | |
| 5%|βββββββββ | 4/84 [01:22<24:13, 18.17s/it] 6%|βββββββββββ | 5/84 [01:39<23:21, 17.74s/it] {'loss': 2.6876, 'grad_norm': 5.082422733306885, 'learning_rate': 9.962168405763805e-06, 'ppl': 14.69636, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 351.228515625, 'epoch': 0.12, 'tokens/total': 81920.0, 'tokens/trainable': 56961.0} | |
| 6%|βββββββββββ | 5/84 [01:39<23:21, 17.74s/it] 7%|βββββββββββββ | 6/84 [01:54<21:49, 16.78s/it] {'loss': 3.0661, 'grad_norm': 4.808957099914551, 'learning_rate': 9.948978913598694e-06, 'ppl': 21.45805, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 418.3861083984375, 'epoch': 0.14, 'tokens/total': 98304.0, 'tokens/trainable': 69788.0} | |
| 7%|βββββββββββββ | 6/84 [01:54<21:49, 16.78s/it] 8%|βββββββββββββββ | 7/84 [02:09<20:46, 16.19s/it] {'loss': 2.7881, 'grad_norm': 3.901789426803589, 'learning_rate': 9.935483831213787e-06, 'ppl': 16.25012, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.87, 'tokens/train_per_sec_per_gpu': 364.1433410644531, 'epoch': 0.17, 'tokens/total': 114688.0, 'tokens/trainable': 81854.0} | |
| 8%|βββββββββββββββ | 7/84 [02:09<20:46, 16.19s/it] 10%|βββββββββββββββββ | 8/84 [02:24<20:00, 15.80s/it] {'loss': 2.556, 'grad_norm': 3.0737860202789307, 'learning_rate': 9.921671335177962e-06, 'ppl': 12.88418, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 393.31671142578125, 'epoch': 0.19, 'tokens/total': 131072.0, 'tokens/trainable': 94072.0} | |
| 10%|βββββββββββββββββ | 8/84 [02:24<20:00, 15.80s/it] 11%|βββββββββββββββββββ | 9/84 [02:39<19:25, 15.54s/it] {'loss': 2.8264, 'grad_norm': 2.6989502906799316, 'learning_rate': 9.907529602060094e-06, 'ppl': 16.88457, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.95263671875, 'epoch': 0.21, 'tokens/total': 147456.0, 'tokens/trainable': 106403.0} | |
| 11%|βββββββββββββββββββ | 9/84 [02:39<19:25, 15.54s/it] 12%|βββββββββββββββββββββ | 10/84 [02:54<18:59, 15.39s/it] {'loss': 2.7276, 'grad_norm': 1.9797707796096802, 'learning_rate': 9.893047717923764e-06, 'ppl': 15.29613, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.0211181640625, 'epoch': 0.24, 'tokens/total': 163840.0, 'tokens/trainable': 119564.0} | |
| 12%|βββββββββββββββββββββ | 10/84 [02:54<18:59, 15.39s/it] 13%|βββββββββββββββββββββββ | 11/84 [03:09<18:36, 15.29s/it] {'loss': 2.7648, 'grad_norm': 1.7535356283187866, 'learning_rate': 9.878213859337848e-06, 'ppl': 15.87586, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.7185974121094, 'epoch': 0.26, 'tokens/total': 180224.0, 'tokens/trainable': 130051.0} | |
| 13%|βββββββββββββββββββββββ | 11/84 [03:09<18:36, 15.29s/it] 14%|βββββββββββββββββββββββββ | 12/84 [03:24<18:16, 15.23s/it] {'loss': 2.7597, 'grad_norm': 1.589630126953125, 'learning_rate': 9.863013474387117e-06, 'ppl': 15.7951, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 436.2579345703125, 'epoch': 0.29, 'tokens/total': 196608.0, 'tokens/trainable': 141402.0} | |
| 14%|βββββββββββββββββββββββββ | 12/84 [03:24<18:16, 15.23s/it] 15%|βββββββββββββββββββββββββββ | 13/84 [03:39<17:56, 15.16s/it] {'loss': 2.371, 'grad_norm': 1.2782939672470093, 'learning_rate': 9.847433830145746e-06, 'ppl': 10.7081, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 380.26220703125, 'epoch': 0.31, 'tokens/total': 212992.0, 'tokens/trainable': 152557.0} | |
| 15%|βββββββββββββββββββββββββββ | 13/84 [03:39<17:56, 15.16s/it] 17%|βββββββββββββββββββββββββββββ | 14/84 [03:55<17:39, 15.13s/it] {'loss': 2.3513, 'grad_norm': 1.4143847227096558, 'learning_rate': 9.831460374698509e-06, 'ppl': 10.49921, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 394.7943115234375, 'epoch': 0.33, 'tokens/total': 229376.0, 'tokens/trainable': 165183.0} | |
| 17%|βββββββββββββββββββββββββββββ | 14/84 [03:55<17:39, 15.13s/it] 18%|βββββββββββββββββββββββββββββββ | 15/84 [04:10<17:23, 15.13s/it] {'loss': 2.7007, 'grad_norm': 1.272851586341858, 'learning_rate': 9.815078556130175e-06, 'ppl': 14.89015, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.2015380859375, 'epoch': 0.36, 'tokens/total': 245760.0, 'tokens/trainable': 176902.0} | |
| 18%|βββββββββββββββββββββββββββββββ | 15/84 [04:10<17:23, 15.13s/it] 19%|βββββββββββββββββββββββββββββββββ | 16/84 [04:25<17:07, 15.11s/it] {'loss': 2.6622, 'grad_norm': 1.1848429441452026, 'learning_rate': 9.79827109404141e-06, 'ppl': 14.32778, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.4239501953125, 'epoch': 0.38, 'tokens/total': 262144.0, 'tokens/trainable': 189732.0} | |
| 19%|βββββββββββββββββββββββββββββββββ | 16/84 [04:25<17:07, 15.11s/it] 20%|βββββββββββββββββββββββββββββββββββ | 17/84 [04:40<16:50, 15.08s/it] {'loss': 2.7354, 'grad_norm': 1.4691566228866577, 'learning_rate': 9.781021617527585e-06, 'ppl': 15.41591, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 288.30322265625, 'epoch': 0.4, 'tokens/total': 278528.0, 'tokens/trainable': 199657.0} | |
| 20%|βββββββββββββββββββββββββββββββββββ | 17/84 [04:40<16:50, 15.08s/it] 21%|βββββββββββββββββββββββββββββββββββββ | 18/84 [04:55<16:33, 15.06s/it] {'loss': 2.5652, 'grad_norm': 1.199852466583252, 'learning_rate': 9.763312846189365e-06, 'ppl': 13.00326, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 437.531982421875, 'epoch': 0.43, 'tokens/total': 294912.0, 'tokens/trainable': 211618.0} | |
| 21%|βββββββββββββββββββββββββββββββββββββ | 18/84 [04:55<16:33, 15.06s/it] 23%|βββββββββββββββββββββββββββββββββββββββ | 19/84 [05:10<16:18, 15.05s/it] {'loss': 2.4957, 'grad_norm': 1.1276423931121826, 'learning_rate': 9.745127499627415e-06, 'ppl': 12.13022, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.6016540527344, 'epoch': 0.45, 'tokens/total': 311296.0, 'tokens/trainable': 223722.0} | |
| 23%|βββββββββββββββββββββββββββββββββββββββ | 19/84 [05:10<16:18, 15.05s/it] 24%|βββββββββββββββββββββββββββββββββββββββββ | 20/84 [05:25<16:03, 15.05s/it] {'loss': 2.5416, 'grad_norm': 1.1429506540298462, 'learning_rate': 9.726443749968894e-06, 'ppl': 12.69997, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 474.4263000488281, 'epoch': 0.48, 'tokens/total': 327680.0, 'tokens/trainable': 236639.0} | |
| 24%|βββββββββββββββββββββββββββββββββββββββββ | 20/84 [05:25<16:03, 15.05s/it] 25%|βββββββββββββββββββββββββββββββββββββββββββ | 21/84 [05:40<15:48, 15.05s/it] {'loss': 2.437, 'grad_norm': 0.9757459759712219, 'learning_rate': 9.707241588330362e-06, 'ppl': 11.43867, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 408.1600341796875, 'epoch': 0.5, 'tokens/total': 344064.0, 'tokens/trainable': 248033.0} | |
| 25%|βββββββββββββββββββββββββββββββββββββββββββ | 21/84 [05:40<15:48, 15.05s/it] 26%|βββββββββββββββββββββββββββββββββββββββββββββ | 22/84 [05:55<15:34, 15.08s/it] {'loss': 2.2622, 'grad_norm': 0.9250922799110413, 'learning_rate': 9.687500096333679e-06, 'ppl': 9.6042, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 341.8017578125, 'epoch': 0.52, 'tokens/total': 360448.0, 'tokens/trainable': 259928.0} | |
| 26%|βββββββββββββββββββββββββββββββββββββββββββββ | 22/84 [05:55<15:34, 15.08s/it] 27%|βββββββββββββββββββββββββββββββββββββββββββββββ | 23/84 [06:10<15:20, 15.10s/it] {'loss': 2.6973, 'grad_norm': 1.0023396015167236, 'learning_rate': 9.667194717621896e-06, 'ppl': 14.83961, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 456.94940185546875, 'epoch': 0.55, 'tokens/total': 376832.0, 'tokens/trainable': 273229.0} | |
| 27%|βββββββββββββββββββββββββββββββββββββββββββββββ | 23/84 [06:10<15:20, 15.10s/it] 29%|βββββββββββββββββββββββββββββββββββββββββββββββββ | 24/84 [06:25<15:05, 15.09s/it] {'loss': 2.4554, 'grad_norm': 0.8203016519546509, 'learning_rate': 9.646301805332769e-06, 'ppl': 11.65109, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 377.00543212890625, 'epoch': 0.57, 'tokens/total': 393216.0, 'tokens/trainable': 285582.0} | |
| 29%|βββββββββββββββββββββββββββββββββββββββββββββββββ | 24/84 [06:25<15:05, 15.09s/it] 30%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 25/84 [06:40<14:48, 15.06s/it] {'loss': 2.4833, 'grad_norm': 0.9041063785552979, 'learning_rate': 9.624795893614646e-06, 'ppl': 11.98074, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 307.10601806640625, 'epoch': 0.6, 'tokens/total': 409600.0, 'tokens/trainable': 296666.0} | |
| 30%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 25/84 [06:40<14:48, 15.06s/it] 31%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 26/84 [06:55<14:34, 15.07s/it] {'loss': 2.5628, 'grad_norm': 0.9094843864440918, 'learning_rate': 9.602648788131773e-06, 'ppl': 12.97209, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 424.59381103515625, 'epoch': 0.62, 'tokens/total': 425984.0, 'tokens/trainable': 309496.0} | |
| 31%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 26/84 [06:55<14:34, 15.07s/it] 32%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 27/84 [07:10<14:19, 15.07s/it] {'loss': 2.5322, 'grad_norm': 0.9065321683883667, 'learning_rate': 9.579831385053694e-06, 'ppl': 12.58115, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.19561767578125, 'epoch': 0.64, 'tokens/total': 442368.0, 'tokens/trainable': 323050.0} | |
| 32%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 27/84 [07:10<14:19, 15.07s/it] 33%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 28/84 [07:25<14:03, 15.06s/it] {'loss': 2.4975, 'grad_norm': 0.9096329212188721, 'learning_rate': 9.55631367105525e-06, 'ppl': 12.15208, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 276.995361328125, 'epoch': 0.67, 'tokens/total': 458752.0, 'tokens/trainable': 332229.0} | |
| 33%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 28/84 [07:25<14:03, 15.06s/it] 35%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 29/84 [07:41<13:48, 15.07s/it] {'loss': 2.2896, 'grad_norm': 0.811943531036377, 'learning_rate': 9.532061994832475e-06, 'ppl': 9.87099, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.5038146972656, 'epoch': 0.69, 'tokens/total': 475136.0, 'tokens/trainable': 342910.0} | |
| 35%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 29/84 [07:41<13:48, 15.07s/it] 36%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 30/84 [07:56<13:32, 15.05s/it] {'loss': 2.1711, 'grad_norm': 0.7714135646820068, 'learning_rate': 9.507041795586701e-06, 'ppl': 8.76792, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.57464599609375, 'epoch': 0.71, 'tokens/total': 491520.0, 'tokens/trainable': 354990.0} | |
| 36%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 30/84 [07:56<13:32, 15.05s/it] 37%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 31/84 [08:11<13:18, 15.06s/it] {'loss': 2.5228, 'grad_norm': 0.7664998769760132, 'learning_rate': 9.48121669352986e-06, 'ppl': 12.46345, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 419.0357666015625, 'epoch': 0.74, 'tokens/total': 507904.0, 'tokens/trainable': 368404.0} | |
| 37%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 31/84 [08:11<13:18, 15.06s/it] 38%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 32/84 [08:26<13:03, 15.06s/it] {'loss': 2.4413, 'grad_norm': 0.7703967094421387, 'learning_rate': 9.454544851905666e-06, 'ppl': 11.48797, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.70343017578125, 'epoch': 0.76, 'tokens/total': 524288.0, 'tokens/trainable': 381274.0} | |
| 38%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 32/84 [08:26<13:03, 15.06s/it] 39%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 33/84 [08:41<12:47, 15.04s/it] {'loss': 2.3928, 'grad_norm': 0.8420023918151855, 'learning_rate': 9.426987162441947e-06, 'ppl': 10.94409, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 350.9320983886719, 'epoch': 0.79, 'tokens/total': 540672.0, 'tokens/trainable': 392843.0} | |
| 39%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 33/84 [08:41<12:47, 15.04s/it] 40%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 34/84 [08:56<12:33, 15.07s/it] {'loss': 2.4511, 'grad_norm': 0.7319009900093079, 'learning_rate': 9.398496331414208e-06, 'ppl': 11.6011, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 461.1392822265625, 'epoch': 0.81, 'tokens/total': 557056.0, 'tokens/trainable': 406127.0} | |
| 40%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 34/84 [08:56<12:33, 15.07s/it] 42%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 35/84 [09:11<12:18, 15.07s/it] {'loss': 2.6036, 'grad_norm': 0.845214307308197, 'learning_rate': 9.369024155603256e-06, 'ppl': 13.51229, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.6831359863281, 'epoch': 0.83, 'tokens/total': 573440.0, 'tokens/trainable': 416879.0} | |
| 42%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 35/84 [09:11<12:18, 15.07s/it] 43%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 36/84 [09:26<12:02, 15.06s/it] {'loss': 2.3314, 'grad_norm': 0.7980889678001404, 'learning_rate': 9.338521522295196e-06, 'ppl': 10.29234, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 359.2618103027344, 'epoch': 0.86, 'tokens/total': 589824.0, 'tokens/trainable': 428113.0} | |
| 43%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 36/84 [09:26<12:02, 15.06s/it] 44%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 37/84 [09:41<11:49, 15.09s/it] {'loss': 2.4644, 'grad_norm': 0.7808263897895813, 'learning_rate': 9.306930223829113e-06, 'ppl': 11.75643, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.8631591796875, 'epoch': 0.88, 'tokens/total': 606208.0, 'tokens/trainable': 441366.0} | |
| 44%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 37/84 [09:41<11:49, 15.09s/it] 45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 38/84 [09:56<11:33, 15.07s/it] {'loss': 1.9823, 'grad_norm': 0.7255628705024719, 'learning_rate': 9.274192962038796e-06, 'ppl': 7.25942, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 435.92559814453125, 'epoch': 0.9, 'tokens/total': 622592.0, 'tokens/trainable': 453641.0} | |
| 45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 38/84 [09:56<11:33, 15.07s/it] 46%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 39/84 [10:11<11:17, 15.05s/it] {'loss': 2.3952, 'grad_norm': 0.8048214316368103, 'learning_rate': 9.240246072295122e-06, 'ppl': 10.97039, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 308.4759521484375, 'epoch': 0.93, 'tokens/total': 638976.0, 'tokens/trainable': 465009.0} | |
| 46%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 39/84 [10:11<11:17, 15.05s/it] 48%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 40/84 [10:26<11:02, 15.05s/it] {'loss': 2.119, 'grad_norm': 0.834456741809845, 'learning_rate': 9.205020433000755e-06, 'ppl': 8.32281, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 238.43943786621094, 'epoch': 0.95, 'tokens/total': 655360.0, 'tokens/trainable': 474320.0} | |
| 48%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 40/84 [10:26<11:02, 15.05s/it] 49%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 41/84 [10:41<10:47, 15.05s/it] {'loss': 2.1589, 'grad_norm': 0.7231636047363281, 'learning_rate': 9.168443284579553e-06, 'ppl': 8.6616, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 387.91583251953125, 'epoch': 0.98, 'tokens/total': 671744.0, 'tokens/trainable': 486804.0} | |
| 49%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 41/84 [10:41<10:47, 15.05s/it] 50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 42/84 [10:56<10:32, 15.07s/it] {'loss': 2.2662, 'grad_norm': 0.7623542547225952, 'learning_rate': 9.13043459149776e-06, 'ppl': 9.64269, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 316.7720031738281, 'epoch': 1.0, 'tokens/total': 688128.0, 'tokens/trainable': 498319.0} | |
| 50%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 42/84 [10:56<10:32, 15.07s/it][2026-01-04 00:23:42,204] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-42 | |
| 51%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 43/84 [11:15<11:01, 16.15s/it] {'loss': 2.364, 'grad_norm': 0.820637583732605, 'learning_rate': 9.09090886125341e-06, 'ppl': 10.6334, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 388.0730285644531, 'epoch': 1.02, 'tokens/total': 704512.0, 'tokens/trainable': 509948.0} | |
| 51%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 43/84 [11:15<11:01, 16.15s/it] 52%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 44/84 [11:30<10:33, 15.83s/it] {'loss': 2.2541, 'grad_norm': 0.7583165168762207, 'learning_rate': 9.049773325386923e-06, 'ppl': 9.52672, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 406.7909851074219, 'epoch': 1.05, 'tokens/total': 720896.0, 'tokens/trainable': 521725.0} | |
| 52%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 44/84 [11:30<10:33, 15.83s/it] 54%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 45/84 [11:45<10:07, 15.58s/it] {'loss': 2.184, 'grad_norm': 0.7426517009735107, 'learning_rate': 9.006927939481102e-06, 'ppl': 8.88176, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.7640380859375, 'epoch': 1.07, 'tokens/total': 737280.0, 'tokens/trainable': 533462.0} | |
| 54%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 45/84 [11:45<10:07, 15.58s/it] 55%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 46/84 [12:00<09:45, 15.42s/it] {'loss': 2.2603, 'grad_norm': 0.7592601776123047, 'learning_rate': 8.962263564171735e-06, 'ppl': 9.58596, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.5147399902344, 'epoch': 1.1, 'tokens/total': 753664.0, 'tokens/trainable': 544963.0} | |
| 55%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 46/84 [12:00<09:45, 15.42s/it] 56%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 47/84 [12:15<09:27, 15.33s/it] {'loss': 2.3293, 'grad_norm': 0.712399423122406, 'learning_rate': 8.915662874642294e-06, 'ppl': 10.27075, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.820068359375, 'epoch': 1.12, 'tokens/total': 770048.0, 'tokens/trainable': 557568.0} | |
| 56%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 47/84 [12:15<09:27, 15.33s/it] 57%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 48/84 [12:30<09:10, 15.28s/it] {'loss': 2.3301, 'grad_norm': 0.6970159411430359, 'learning_rate': 8.866994903655723e-06, 'ppl': 10.27897, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 479.2054443359375, 'epoch': 1.14, 'tokens/total': 786432.0, 'tokens/trainable': 571098.0} | |
| 57%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 48/84 [12:30<09:10, 15.28s/it] 58%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 49/84 [12:46<08:53, 15.24s/it] {'loss': 2.1423, 'grad_norm': 0.8117857575416565, 'learning_rate': 8.81612049852265e-06, 'ppl': 8.51901, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 311.01019287109375, 'epoch': 1.17, 'tokens/total': 802816.0, 'tokens/trainable': 580687.0} | |
| 58%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 49/84 [12:46<08:53, 15.24s/it] 60%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 50/84 [13:01<08:37, 15.21s/it] {'loss': 2.287, 'grad_norm': 0.7534531950950623, 'learning_rate': 8.762885954638477e-06, 'ppl': 9.84536, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 460.1733703613281, 'epoch': 1.19, 'tokens/total': 819200.0, 'tokens/trainable': 593415.0} | |
| 60%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 50/84 [13:01<08:37, 15.21s/it] 61%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 51/84 [13:16<08:21, 15.18s/it] {'loss': 2.2903, 'grad_norm': 0.7555272579193115, 'learning_rate': 8.707123924978077e-06, 'ppl': 9.8779, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 357.5971374511719, 'epoch': 1.21, 'tokens/total': 835584.0, 'tokens/trainable': 605122.0} | |
| 61%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 51/84 [13:16<08:21, 15.18s/it] 62%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 52/84 [13:31<08:04, 15.16s/it] {'loss': 2.0339, 'grad_norm': 0.7037354707717896, 'learning_rate': 8.648648872622289e-06, 'ppl': 7.64384, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 445.2397155761719, 'epoch': 1.24, 'tokens/total': 851968.0, 'tokens/trainable': 617663.0} | |
| 62%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 52/84 [13:31<08:04, 15.16s/it] 63%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 53/84 [13:46<07:49, 15.14s/it] {'loss': 2.208, 'grad_norm': 0.7909743189811707, 'learning_rate': 8.587257980252616e-06, 'ppl': 9.0975, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 342.66082763671875, 'epoch': 1.26, 'tokens/total': 868352.0, 'tokens/trainable': 628294.0} | |
| 63%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 53/84 [13:46<07:49, 15.14s/it] 64%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 54/84 [14:01<07:33, 15.12s/it] {'loss': 2.0099, 'grad_norm': 0.9540871977806091, 'learning_rate': 8.522727512172423e-06, 'ppl': 7.46257, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 331.34002685546875, 'epoch': 1.29, 'tokens/total': 884736.0, 'tokens/trainable': 639131.0} | |
| 64%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 54/84 [14:01<07:33, 15.12s/it] 65%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 55/84 [14:16<07:18, 15.11s/it] {'loss': 2.1202, 'grad_norm': 0.7127824425697327, 'learning_rate': 8.454810085822828e-06, 'ppl': 8.3328, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 309.6125793457031, 'epoch': 1.31, 'tokens/total': 901120.0, 'tokens/trainable': 650860.0} | |
| 65%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 55/84 [14:16<07:18, 15.11s/it] 67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 56/84 [14:31<07:02, 15.08s/it] {'loss': 2.2369, 'grad_norm': 0.8307490944862366, 'learning_rate': 8.383233762288e-06, 'ppl': 9.36426, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 360.55908203125, 'epoch': 1.33, 'tokens/total': 917504.0, 'tokens/trainable': 660335.0} | |
| 67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 56/84 [14:31<07:02, 15.08s/it] 68%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 57/84 [14:46<06:47, 15.10s/it] {'loss': 2.4058, 'grad_norm': 0.8058661818504333, 'learning_rate': 8.307692041853443e-06, 'ppl': 11.0873, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 369.5503845214844, 'epoch': 1.36, 'tokens/total': 933888.0, 'tokens/trainable': 672664.0} | |
| 68%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 57/84 [14:46<06:47, 15.10s/it] 69%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 58/84 [15:01<06:32, 15.10s/it] {'loss': 2.2731, 'grad_norm': 0.7699398994445801, 'learning_rate': 8.227847501984797e-06, 'ppl': 9.70945, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 386.2121887207031, 'epoch': 1.38, 'tokens/total': 950272.0, 'tokens/trainable': 685353.0} | |
| 69%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 58/84 [15:01<06:32, 15.10s/it] 70%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 59/84 [15:17<06:17, 15.09s/it] {'loss': 2.2848, 'grad_norm': 0.7645865678787231, 'learning_rate': 8.143322702380829e-06, 'ppl': 9.82372, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 433.5550842285156, 'epoch': 1.4, 'tokens/total': 966656.0, 'tokens/trainable': 697377.0} | |
| 70%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 59/84 [15:17<06:17, 15.09s/it] 71%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 60/84 [15:32<06:01, 15.08s/it] {'loss': 2.2004, 'grad_norm': 0.7868902087211609, 'learning_rate': 8.053691090026405e-06, 'ppl': 9.02862, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 322.94708251953125, 'epoch': 1.43, 'tokens/total': 983040.0, 'tokens/trainable': 708580.0} | |
| 71%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 60/84 [15:32<06:01, 15.08s/it] 73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 61/84 [15:47<05:47, 15.10s/it] {'loss': 2.1096, 'grad_norm': 0.8960193991661072, 'learning_rate': 7.9584779086872e-06, 'ppl': 8.24494, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 378.5703125, 'epoch': 1.45, 'tokens/total': 999424.0, 'tokens/trainable': 721379.0} | |
| 73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 61/84 [15:47<05:47, 15.10s/it] 74%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 62/84 [16:02<05:31, 15.08s/it] {'loss': 1.994, 'grad_norm': 0.7898715138435364, 'learning_rate': 7.857142918510363e-06, 'ppl': 7.34485, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 345.618408203125, 'epoch': 1.48, 'tokens/total': 1015808.0, 'tokens/trainable': 732230.0} | |
| 74%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 62/84 [16:02<05:31, 15.08s/it] 75%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 63/84 [16:17<05:16, 15.09s/it] {'loss': 2.1251, 'grad_norm': 0.7547686100006104, 'learning_rate': 7.749077667540405e-06, 'ppl': 8.37373, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.9949951171875, 'epoch': 1.5, 'tokens/total': 1032192.0, 'tokens/trainable': 743308.0} | |
| 75%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 63/84 [16:17<05:16, 15.09s/it] 76%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 64/84 [16:32<05:01, 15.06s/it] {'loss': 2.2899, 'grad_norm': 0.8282785415649414, 'learning_rate': 7.633587301825173e-06, 'ppl': 9.87395, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 389.5559997558594, 'epoch': 1.52, 'tokens/total': 1048576.0, 'tokens/trainable': 754578.0} | |
| 76%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 64/84 [16:32<05:01, 15.06s/it] 77%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 65/84 [16:47<04:46, 15.07s/it] {'loss': 1.982, 'grad_norm': 0.7019599080085754, 'learning_rate': 7.509881015721476e-06, 'ppl': 7.25724, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 467.1014709472656, 'epoch': 1.55, 'tokens/total': 1064960.0, 'tokens/trainable': 767778.0} | |
| 77%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 65/84 [16:47<04:46, 15.07s/it] 79%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 66/84 [17:02<04:31, 15.06s/it] {'loss': 2.177, 'grad_norm': 0.8358809351921082, 'learning_rate': 7.3770493145275395e-06, 'ppl': 8.81981, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 329.0794982910156, 'epoch': 1.57, 'tokens/total': 1081344.0, 'tokens/trainable': 778410.0} | |
| 79%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 66/84 [17:02<04:31, 15.06s/it] 80%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 67/84 [17:17<04:16, 15.07s/it] {'loss': 1.9627, 'grad_norm': 0.7629905343055725, 'learning_rate': 7.234042186610168e-06, 'ppl': 7.11852, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 318.5070495605469, 'epoch': 1.6, 'tokens/total': 1097728.0, 'tokens/trainable': 790046.0} | |
| 80%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 67/84 [17:17<04:16, 15.07s/it] 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 68/84 [17:32<04:01, 15.09s/it] {'loss': 2.2829, 'grad_norm': 0.7605993151664734, 'learning_rate': 7.079645911289845e-06, 'ppl': 9.80507, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 432.19000244140625, 'epoch': 1.62, 'tokens/total': 1114112.0, 'tokens/trainable': 803491.0} | |
| 81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 68/84 [17:32<04:01, 15.09s/it] 82%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 69/84 [17:47<03:46, 15.11s/it] {'loss': 2.1591, 'grad_norm': 0.7556662559509277, 'learning_rate': 6.912442131579155e-06, 'ppl': 8.66334, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 449.40435791015625, 'epoch': 1.64, 'tokens/total': 1130496.0, 'tokens/trainable': 816919.0} | |
| 82%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 69/84 [17:47<03:46, 15.11s/it] 83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 70/84 [18:02<03:31, 15.12s/it] {'loss': 2.0868, 'grad_norm': 0.7916563749313354, 'learning_rate': 6.73076920065796e-06, 'ppl': 8.05908, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.95281982421875, 'epoch': 1.67, 'tokens/total': 1146880.0, 'tokens/trainable': 827854.0} | |
| 83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 70/84 [18:02<03:31, 15.12s/it] 85%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 71/84 [18:18<03:16, 15.14s/it] {'loss': 2.046, 'grad_norm': 0.7467530965805054, 'learning_rate': 6.532663064717781e-06, 'ppl': 7.73689, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.4479675292969, 'epoch': 1.69, 'tokens/total': 1163264.0, 'tokens/trainable': 841071.0} | |
| 85%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 71/84 [18:18<03:16, 15.14s/it] 86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 72/84 [18:33<03:01, 15.15s/it] {'loss': 2.2584, 'grad_norm': 0.769206166267395, 'learning_rate': 6.315789050859166e-06, 'ppl': 9.56777, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 447.147216796875, 'epoch': 1.71, 'tokens/total': 1179648.0, 'tokens/trainable': 853505.0} | |
| 86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 72/84 [18:33<03:01, 15.15s/it] 87%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 73/84 [18:48<02:46, 15.13s/it] {'loss': 2.2368, 'grad_norm': 0.8090022206306458, 'learning_rate': 6.0773477343900595e-06, 'ppl': 9.36332, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 312.2845153808594, 'epoch': 1.74, 'tokens/total': 1196032.0, 'tokens/trainable': 864232.0} | |
| 87%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 73/84 [18:48<02:46, 15.13s/it] 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 74/84 [19:03<02:31, 15.13s/it] {'loss': 2.2596, 'grad_norm': 0.7845657467842102, 'learning_rate': 5.81395306653576e-06, 'ppl': 9.57926, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 400.0311279296875, 'epoch': 1.76, 'tokens/total': 1212416.0, 'tokens/trainable': 876412.0} | |
| 88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 74/84 [19:03<02:31, 15.13s/it] 89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 75/84 [19:18<02:15, 15.10s/it] {'loss': 2.0239, 'grad_norm': 0.8282763361930847, 'learning_rate': 5.52147184862406e-06, 'ppl': 7.56778, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 425.2695617675781, 'epoch': 1.79, 'tokens/total': 1228800.0, 'tokens/trainable': 887204.0} | |
| 89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 75/84 [19:18<02:15, 15.10s/it] 90%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 76/84 [19:33<02:00, 15.11s/it] {'loss': 2.0459, 'grad_norm': 0.8150551319122314, 'learning_rate': 5.19480499860947e-06, 'ppl': 7.73612, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 381.22265625, 'epoch': 1.81, 'tokens/total': 1245184.0, 'tokens/trainable': 898698.0} | |
| 90%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 76/84 [19:33<02:00, 15.11s/it] 92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 77/84 [19:48<01:46, 15.16s/it] {'loss': 2.3338, 'grad_norm': 0.8316982388496399, 'learning_rate': 4.82758605357958e-06, 'ppl': 10.31707, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 442.561279296875, 'epoch': 1.83, 'tokens/total': 1261568.0, 'tokens/trainable': 910126.0} | |
| 92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 77/84 [19:48<01:46, 15.16s/it] 93%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 78/84 [20:04<01:30, 15.16s/it] {'loss': 2.3689, 'grad_norm': 0.8292647004127502, 'learning_rate': 4.411764621181646e-06, 'ppl': 10.68563, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 363.56951904296875, 'epoch': 1.86, 'tokens/total': 1277952.0, 'tokens/trainable': 922871.0} | |
| 93%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 78/84 [20:04<01:30, 15.16s/it] 94%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 79/84 [20:19<01:15, 15.17s/it] {'loss': 2.2947, 'grad_norm': 0.7834141850471497, 'learning_rate': 3.937007477361476e-06, 'ppl': 9.92146, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 476.15325927734375, 'epoch': 1.88, 'tokens/total': 1294336.0, 'tokens/trainable': 935425.0} | |
| 94%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 79/84 [20:19<01:15, 15.17s/it] 95%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 80/84 [20:34<01:00, 15.18s/it] {'loss': 2.0461, 'grad_norm': 0.7962015271186829, 'learning_rate': 3.3898304536705837e-06, 'ppl': 7.73767, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 420.98101806640625, 'epoch': 1.9, 'tokens/total': 1310720.0, 'tokens/trainable': 947062.0} | |
| 95%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 80/84 [20:34<01:00, 15.18s/it] 96%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 81/84 [20:49<00:45, 15.18s/it] {'loss': 2.2245, 'grad_norm': 0.7918537259101868, 'learning_rate': 2.7522935397428228e-06, 'ppl': 9.24886, 'memory/max_active (GiB)': 20.31, 'memory/max_allocated (GiB)': 20.31, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 347.5594787597656, 'epoch': 1.93, 'tokens/total': 1327104.0, 'tokens/trainable': 959572.0} | |
| 96%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 81/84 [20:49<00:45, 15.18s/it] 98%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 82/84 [21:04<00:30, 15.19s/it] {'loss': 2.4159, 'grad_norm': 0.7803449630737305, 'learning_rate': 1.9999999949504854e-06, 'ppl': 11.19985, 'memory/max_active (GiB)': 20.44, 'memory/max_allocated (GiB)': 20.44, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 441.7668762207031, 'epoch': 1.95, 'tokens/total': 1343488.0, 'tokens/trainable': 973127.0} | |
| 98%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 82/84 [21:04<00:30, 15.19s/it] 99%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 83/84 [21:20<00:15, 15.19s/it] {'loss': 2.13, 'grad_norm': 0.7520664930343628, 'learning_rate': 1.0989010661432985e-06, 'ppl': 8.41487, 'memory/max_active (GiB)': 20.5, 'memory/max_allocated (GiB)': 20.5, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 448.2036437988281, 'epoch': 1.98, 'tokens/total': 1359872.0, 'tokens/trainable': 985757.0} | |
| 99%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 83/84 [21:20<00:15, 15.19s/it] 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 84/84 [21:35<00:00, 15.20s/it] {'loss': 2.2931, 'grad_norm': 0.9194205403327942, 'learning_rate': 0.0, 'ppl': 9.9056, 'memory/max_active (GiB)': 20.43, 'memory/max_allocated (GiB)': 20.43, 'memory/device_reserved (GiB)': 20.93, 'tokens/train_per_sec_per_gpu': 277.64056396484375, 'epoch': 2.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0} | |
| 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 84/84 [21:35<00:00, 15.20s/it][2026-01-04 00:34:20,737] [INFO] [axolotl.core.trainers.base._save:722] [PID:15692] Saving model checkpoint to ./output/checkpoint-84 | |
| {'train_runtime': 1438.4013, 'train_samples_per_second': 0.234, 'train_steps_per_second': 0.058, 'train_loss': 2.380285389366604, 'memory/max_active (GiB)': 15.96, 'memory/max_allocated (GiB)': 15.96, 'memory/device_reserved (GiB)': 20.93, 'epoch': 2.0, 'tokens/train_per_sec_per_gpu': 0.0, 'tokens/total': 1376256.0, 'tokens/trainable': 996638.0} | |
| 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 84/84 [21:36<00:00, 15.20s/it] 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 84/84 [21:36<00:00, 15.44s/it] | |
| [2026-01-04 00:34:22,468] [INFO] [axolotl.train.save_trained_model:233] [PID:15692] Training completed! Saving trained model to ./output. | |
| [2026-01-04 00:34:23,290] [INFO] [axolotl.train.save_trained_model:351] [PID:15692] Model successfully saved to ./output | |
| [0m |