| [2025-11-27 00:21:02,496] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:80269] baseline 0.000GB () | |
| [2025-11-27 00:21:02,496] [INFO] [axolotl.cli.config.load_cfg:248] [PID:80269] config: | |
| { | |
| "activation_offloading": false, | |
| "adapter": "lora", | |
| "axolotl_config_path": "seedcoder.yaml", | |
| "base_model": "ByteDance-Seed/Seed-Coder-8B-Instruct", | |
| "base_model_config": "ByteDance-Seed/Seed-Coder-8B-Instruct", | |
| "batch_size": 128, | |
| "bf16": true, | |
| "capabilities": { | |
| "bf16": true, | |
| "compute_capability": "sm_90", | |
| "fp8": false, | |
| "n_gpu": 8, | |
| "n_node": 1 | |
| }, | |
| "context_parallel_size": 1, | |
| "dataloader_num_workers": 8, | |
| "dataloader_pin_memory": true, | |
| "dataloader_prefetch_factor": 256, | |
| "dataset_num_proc": 208, | |
| "dataset_prepared_path": "last_run_prepared", | |
| "datasets": [ | |
| { | |
| "chat_template": "tokenizer_default", | |
| "field_messages": "messages", | |
| "message_property_mappings": { | |
| "content": "content", | |
| "role": "role" | |
| }, | |
| "path": "new_data_clean.jsonl", | |
| "roles": { | |
| "assistant": [ | |
| "assistant" | |
| ], | |
| "system": [ | |
| "system" | |
| ], | |
| "user": [ | |
| "user" | |
| ] | |
| }, | |
| "trust_remote_code": false, | |
| "type": "chat_template" | |
| } | |
| ], | |
| "ddp": true, | |
| "deepspeed": { | |
| "bf16": { | |
| "enabled": "auto" | |
| }, | |
| "fp16": { | |
| "auto_cast": false, | |
| "enabled": "auto", | |
| "hysteresis": 2, | |
| "initial_scale_power": 32, | |
| "loss_scale": 0, | |
| "loss_scale_window": 1000, | |
| "min_loss_scale": 1 | |
| }, | |
| "gradient_accumulation_steps": "auto", | |
| "gradient_clipping": "auto", | |
| "train_batch_size": "auto", | |
| "train_micro_batch_size_per_gpu": "auto", | |
| "wall_clock_breakdown": false, | |
| "zero_optimization": { | |
| "contiguous_gradients": true, | |
| "offload_optimizer": { | |
| "device": "cpu" | |
| }, | |
| "overlap_comm": true, | |
| "stage": 2 | |
| } | |
| }, | |
| "device": "cuda:0", | |
| "device_map": { | |
| "": 0 | |
| }, | |
| "dion_rank_fraction": 1.0, | |
| "dion_rank_multiple_of": 1, | |
| "env_capabilities": { | |
| "torch_version": "2.8.0" | |
| }, | |
| "eval_batch_size": 16, | |
| "eval_causal_lm_metrics": [ | |
| "sacrebleu", | |
| "comet", | |
| "ter", | |
| "chrf" | |
| ], | |
| "eval_max_new_tokens": 128, | |
| "eval_table_size": 0, | |
| "experimental_skip_move_to_device": true, | |
| "flash_attention": true, | |
| "fp16": false, | |
| "gradient_accumulation_steps": 1, | |
| "gradient_checkpointing": true, | |
| "gradient_checkpointing_kwargs": { | |
| "use_reentrant": false | |
| }, | |
| "group_by_length": false, | |
| "include_tkps": true, | |
| "is_falcon_derived_model": false, | |
| "is_llama_derived_model": true, | |
| "is_mistral_derived_model": false, | |
| "learning_rate": 0.0001, | |
| "liger_fused_linear_cross_entropy": true, | |
| "liger_glu_activation": true, | |
| "liger_layer_norm": true, | |
| "liger_rms_norm": true, | |
| "liger_rope": true, | |
| "lisa_layers_attribute": "model.layers", | |
| "load_best_model_at_end": false, | |
| "load_in_4bit": false, | |
| "load_in_8bit": false, | |
| "local_rank": 0, | |
| "logging_steps": 1, | |
| "lora_alpha": 64, | |
| "lora_dropout": 0.05, | |
| "lora_r": 64, | |
| "lora_target_linear": true, | |
| "loraplus_lr_embedding": 1e-06, | |
| "lr_scheduler": "cosine", | |
| "max_grad_norm": 1.0, | |
| "mean_resizing_embeddings": false, | |
| "micro_batch_size": 16, | |
| "model_config_type": "llama", | |
| "num_epochs": 1.0, | |
| "optimizer": "adamw_torch", | |
| "otel_metrics_host": "localhost", | |
| "otel_metrics_port": 8000, | |
| "output_dir": "./nov262025-sc-LoRA-Run", | |
| "pad_to_sequence_len": true, | |
| "plugins": [ | |
| "axolotl.integrations.liger.LigerPlugin" | |
| ], | |
| "pretrain_multipack_attn": true, | |
| "profiler_steps_start": 0, | |
| "qlora_sharded_model_loading": false, | |
| "ray_num_workers": 1, | |
| "resources_per_worker": { | |
| "GPU": 1 | |
| }, | |
| "sample_packing": false, | |
| "sample_packing_bin_size": 200, | |
| "sample_packing_group_size": 100000, | |
| "save_only_model": false, | |
| "save_safetensors": true, | |
| "save_steps": 60, | |
| "save_total_limit": 100, | |
| "sequence_len": 4096, | |
| "shuffle_before_merging_datasets": false, | |
| "shuffle_merged_datasets": true, | |
| "skip_prepare_dataset": false, | |
| "streaming_multipack_buffer_size": 10000, | |
| "strict": false, | |
| "tensor_parallel_size": 1, | |
| "tf32": false, | |
| "tiled_mlp_use_original_mlp": true, | |
| "tokenizer_config": "ByteDance-Seed/Seed-Coder-8B-Instruct", | |
| "tokenizer_save_jinja_files": true, | |
| "tokenizer_type": "AutoTokenizer", | |
| "torch_dtype": "torch.bfloat16", | |
| "train_on_inputs": false, | |
| "trl": { | |
| "log_completions": false, | |
| "mask_truncated_completions": false, | |
| "ref_model_mixup_alpha": 0.9, | |
| "ref_model_sync_steps": 64, | |
| "scale_rewards": true, | |
| "sync_ref_model": false, | |
| "use_vllm": false, | |
| "vllm_server_host": "0.0.0.0", | |
| "vllm_server_port": 8000 | |
| }, | |
| "trust_remote_code": true, | |
| "type_of_model": "AutoModelForCausalLM", | |
| "use_otel_metrics": false, | |
| "use_ray": false, | |
| "use_wandb": true, | |
| "val_set_size": 0.0, | |
| "vllm": { | |
| "device": "auto", | |
| "dtype": "auto", | |
| "gpu_memory_utilization": 0.9, | |
| "host": "0.0.0.0", | |
| "port": 8000 | |
| }, | |
| "wandb_entity": "test-aa", | |
| "wandb_name": "nov-26-sc-lor-run-1", | |
| "wandb_project": "seedcoder", | |
| "warmup_ratio": 0.05, | |
| "weight_decay": 0.0, | |
| "world_size": 8 | |
| } | |
| [2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]> | |
| [2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]> | |
| [2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]> | |
| [2025-11-27 00:21:04,906] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None | |
| [2025-11-27 00:21:41,317] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:80269] Loading prepared dataset from disk at last_run_prepared/683f1b6addffef1a6c101561a46fc077... | |
| Loading dataset from disk: 0%| | 0/110 [00:00<?, ?it/s] Loading dataset from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 82772.41it/s] | |
| [2025-11-27 00:21:41,594] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:80269] total_num_tokens: 75_959_959 | |
| [2025-11-27 00:21:42,244] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:80269] `total_supervised_tokens: 5_309_191` | |
| [2025-11-27 00:21:42,245] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:80269] total_num_steps: 221 | |
| [2025-11-27 00:21:42,245] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:80269] Maximum number of steps set at 221 | |
| [2025-11-27 00:21:42,270] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:80269] Loading tokenizer... ByteDance-Seed/Seed-Coder-8B-Instruct | |
| [2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:80269] EOS: 2 / <[end▁of▁sentence]> | |
| [2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:80269] BOS: 0 / <[begin▁of▁sentence]> | |
| [2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:80269] PAD: 1 / <[PAD▁TOKEN]> | |
| [2025-11-27 00:21:42,832] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:80269] UNK: None / None | |
| [2025-11-27 00:21:42,832] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:80269] Loading model | |
| [2025-11-27 00:21:42,935] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:80269] Patched Trainer.evaluation_loop with nanmean loss calculation | |
| [2025-11-27 00:21:42,937] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:80269] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation | |
| [2025-11-27 00:21:42,955] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:71] [PID:80269] Applying LIGER to llama with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True} | |
| Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] Loading checkpoint shards: 25%|████████████████████████████████ | 1/4 [00:01<00:03, 1.31s/it] Loading checkpoint shards: 50%|████████████████████████████████████████████████████████████████ | 2/4 [00:02<00:02, 1.27s/it] Loading checkpoint shards: 75%|████████████████████████████████████████████████████████████████████████████████████████████████ | 3/4 [00:03<00:01, 1.28s/it] Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.07it/s] Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.06s/it] | |
| [2025-11-27 00:22:10,720] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:80269] Converting modules to torch.bfloat16 | |
| [2025-11-27 00:22:10,723] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] Memory usage after model load 18.938GB (+18.938GB allocated, +20.139GB reserved) | |
| [2025-11-27 00:22:10,724] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:80269] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj'] | |
| trainable params: 167,772,160 || all params: 8,418,234,368 || trainable%: 1.9930 | |
| [2025-11-27 00:22:12,070] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:80269] after adapters 16.002GB (+16.002GB allocated, +20.436GB reserved) | |
| [2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:398] [PID:80269] Pre-saving adapter config to ./nov262025-sc-LoRA-Run... | |
| [2025-11-27 00:22:13,617] [INFO] [axolotl.train.save_initial_configs:402] [PID:80269] Pre-saving tokenizer to ./nov262025-sc-LoRA-Run... | |
| [2025-11-27 00:22:13,712] [INFO] [axolotl.train.save_initial_configs:407] [PID:80269] Pre-saving model config to ./nov262025-sc-LoRA-Run... | |
| [2025-11-27 00:22:13,716] [INFO] [axolotl.train.execute_training:196] [PID:80269] Starting trainer... | |
| Time to load cpu_adam op: 2.386819839477539 seconds | |
| [34m[1mwandb[0m: Currently logged in as: [33mpandyamarut[0m ([33mtest-aa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin | |
| [34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()... | |
| [Am[2K [34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()... | |
| [Am[2K [34m[1mwandb[0m: Tracking run with wandb version 0.22.3 | |
| [34m[1mwandb[0m: Run data is saved locally in [35m[1m/osmosis/wandb/run-20251127_002220-5un64tuw[0m | |
| [34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing. | |
| [34m[1mwandb[0m: Syncing run [33mnov-26-sc-lor-run-1[0m | |
| [34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/test-aa/seedcoder[0m | |
| [34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/test-aa/seedcoder/runs/5un64tuw[0m | |
| [34m[1mwandb[0m: Detected [huggingface_hub.inference] in use. | |
| [34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. | |
| [34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/ | |
| [34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") | |
| [2025-11-27 00:22:21,580] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:80269] The Axolotl config has been saved to the WandB run under files. | |
| [2025-11-27 00:22:22,118] [INFO] [axolotl.utils.callbacks.on_train_begin:820] [PID:80269] The DeepSpeed config has been saved to the WandB run under files. | |
| 0%| | 0/221 [00:00<?, ?it/s] 0%|▋ | 1/221 [00:10<37:28, 10.22s/it] {'loss': 0.0756, 'grad_norm': 0.10699598491191864, 'learning_rate': 0.0, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.81, 'tokens_per_second_per_gpu': 336.12, 'epoch': 0.0} | |
| 0%|▋ | 1/221 [00:10<37:28, 10.22s/it] 1%|█▍ | 2/221 [00:17<31:29, 8.63s/it] {'loss': 0.069, 'grad_norm': 0.10141075402498245, 'learning_rate': 9.090909090909091e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 499.67, 'epoch': 0.01} | |
| 1%|█▍ | 2/221 [00:17<31:29, 8.63s/it] 1%|██ | 3/221 [00:24<28:45, 7.92s/it] {'loss': 0.0881, 'grad_norm': 0.11926735192537308, 'learning_rate': 1.8181818181818182e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 511.33, 'epoch': 0.01} | |
| 1%|██ | 3/221 [00:24<28:45, 7.92s/it] 2%|██▊ | 4/221 [00:31<27:22, 7.57s/it] {'loss': 0.0793, 'grad_norm': 0.11467798799276352, 'learning_rate': 2.7272727272727273e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.69, 'epoch': 0.02} | |
| 2%|██▊ | 4/221 [00:31<27:22, 7.57s/it] 2%|███▍ | 5/221 [00:38<26:33, 7.38s/it] {'loss': 0.0894, 'grad_norm': 0.10370815545320511, 'learning_rate': 3.6363636363636364e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 327.47, 'epoch': 0.02} | |
| 2%|███▍ | 5/221 [00:38<26:33, 7.38s/it] 3%|████▏ | 6/221 [00:45<26:02, 7.27s/it] {'loss': 0.0653, 'grad_norm': 0.0778045579791069, 'learning_rate': 4.545454545454546e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 438.39, 'epoch': 0.03} | |
| 3%|████▏ | 6/221 [00:45<26:02, 7.27s/it] 3%|████▊ | 7/221 [00:53<25:48, 7.23s/it] {'loss': 0.0604, 'grad_norm': 0.05470091104507446, 'learning_rate': 5.4545454545454546e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 454.21, 'epoch': 0.03} | |
| 3%|████▊ | 7/221 [00:53<25:48, 7.23s/it] 4%|█████▌ | 8/221 [01:00<25:36, 7.21s/it] {'loss': 0.0575, 'grad_norm': 0.04792458191514015, 'learning_rate': 6.363636363636364e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 381.85, 'epoch': 0.04} | |
| 4%|█████▌ | 8/221 [01:00<25:36, 7.21s/it] 4%|██████▏ | 9/221 [01:07<25:21, 7.18s/it] {'loss': 0.057, 'grad_norm': 0.04809016361832619, 'learning_rate': 7.272727272727273e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 395.18, 'epoch': 0.04} | |
| 4%|██████▏ | 9/221 [01:07<25:21, 7.18s/it] 5%|██████▉ | 10/221 [01:14<25:06, 7.14s/it] {'loss': 0.0472, 'grad_norm': 0.05050504207611084, 'learning_rate': 8.181818181818183e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 437.16, 'epoch': 0.05} | |
| 5%|██████▉ | 10/221 [01:14<25:06, 7.14s/it] 5%|███████▌ | 11/221 [01:21<25:01, 7.15s/it] {'loss': 0.0422, 'grad_norm': 0.057043518871068954, 'learning_rate': 9.090909090909092e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 386.23, 'epoch': 0.05} | |
| 5%|███████▌ | 11/221 [01:21<25:01, 7.15s/it] 5%|████████▎ | 12/221 [01:28<25:00, 7.18s/it] {'loss': 0.033, 'grad_norm': 0.04036800563335419, 'learning_rate': 0.0001, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 309.64, 'epoch': 0.05} | |
| 5%|████████▎ | 12/221 [01:28<25:00, 7.18s/it] 6%|████████▉ | 13/221 [01:35<24:46, 7.15s/it] {'loss': 0.0429, 'grad_norm': 0.03150289133191109, 'learning_rate': 9.999440509051368e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 452.59, 'epoch': 0.06} | |
| 6%|████████▉ | 13/221 [01:35<24:46, 7.15s/it] 6%|█████████▋ | 14/221 [01:43<24:39, 7.15s/it] {'loss': 0.0381, 'grad_norm': 0.03723820298910141, 'learning_rate': 9.997762161417517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 365.29, 'epoch': 0.06} | |
| 6%|█████████▋ | 14/221 [01:43<24:39, 7.15s/it] 7%|██████▋ | 15/221 [01:50<24:29, 7.13s/it] {'loss': 0.0445, 'grad_norm': 0.026561176404356956, 'learning_rate': 9.994965332706573e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 383.32, 'epoch': 0.07} | |
| 7%|██████▋ | 15/221 [01:50<24:29, 7.13s/it] 7%|███████ | 16/221 [01:57<24:21, 7.13s/it] {'loss': 0.0347, 'grad_norm': 0.022102832794189453, 'learning_rate': 9.991050648838675e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 571.37, 'epoch': 0.07} | |
| 7%|███████ | 16/221 [01:57<24:21, 7.13s/it] 8%|███████▌ | 17/221 [02:04<24:15, 7.14s/it] {'loss': 0.0332, 'grad_norm': 0.026612414047122, 'learning_rate': 9.986018985905901e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 371.09, 'epoch': 0.08} | |
| 8%|███████▌ | 17/221 [02:04<24:15, 7.14s/it] 8%|███████▉ | 18/221 [02:11<24:04, 7.11s/it] {'loss': 0.0384, 'grad_norm': 0.02820519544184208, 'learning_rate': 9.979871469976196e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 359.8, 'epoch': 0.08} | |
| 8%|███████▉ | 18/221 [02:11<24:04, 7.11s/it] 9%|████████▍ | 19/221 [02:18<23:55, 7.11s/it] {'loss': 0.0306, 'grad_norm': 0.06290236860513687, 'learning_rate': 9.972609476841367e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 255.21, 'epoch': 0.09} | |
| 9%|████████▍ | 19/221 [02:18<23:55, 7.11s/it] 9%|████████▊ | 20/221 [02:25<23:49, 7.11s/it] {'loss': 0.0328, 'grad_norm': 0.02102799527347088, 'learning_rate': 9.964234631709187e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 388.05, 'epoch': 0.09} | |
| 9%|████████▊ | 20/221 [02:25<23:49, 7.11s/it] 10%|█████████▎ | 21/221 [02:32<23:46, 7.13s/it] {'loss': 0.0336, 'grad_norm': 0.022319750860333443, 'learning_rate': 9.954748808839674e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 496.89, 'epoch': 0.1} | |
| 10%|█████████▎ | 21/221 [02:32<23:46, 7.13s/it] 10%|█████████▊ | 22/221 [02:40<23:41, 7.14s/it] {'loss': 0.0349, 'grad_norm': 0.019568774849176407, 'learning_rate': 9.944154131125642e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 417.97, 'epoch': 0.1} | |
| 10%|█████████▊ | 22/221 [02:40<23:41, 7.14s/it] 10%|██████████▏ | 23/221 [02:47<23:30, 7.12s/it] {'loss': 0.0382, 'grad_norm': 0.04317627474665642, 'learning_rate': 9.932452969617607e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 377.6, 'epoch': 0.1} | |
| 10%|██████████▏ | 23/221 [02:47<23:30, 7.12s/it] 11%|██████████▋ | 24/221 [02:54<23:32, 7.17s/it] {'loss': 0.0361, 'grad_norm': 0.027220861986279488, 'learning_rate': 9.919647942993148e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 468.41, 'epoch': 0.11} | |
| 11%|██████████▋ | 24/221 [02:54<23:32, 7.17s/it] 11%|███████████ | 25/221 [03:01<23:27, 7.18s/it] {'loss': 0.031, 'grad_norm': 0.019090518355369568, 'learning_rate': 9.905741916970864e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 419.96, 'epoch': 0.11} | |
| 11%|███████████ | 25/221 [03:01<23:27, 7.18s/it] 12%|███████████▌ | 26/221 [03:09<23:34, 7.26s/it] {'loss': 0.0316, 'grad_norm': 0.019753405824303627, 'learning_rate': 9.890738003669029e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 385.75, 'epoch': 0.12} | |
| 12%|███████████▌ | 26/221 [03:09<23:34, 7.26s/it] 12%|███████████▉ | 27/221 [03:16<23:29, 7.26s/it] {'loss': 0.0402, 'grad_norm': 0.021183036267757416, 'learning_rate': 9.874639560909117e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 481.45, 'epoch': 0.12} | |
| 12%|███████████▉ | 27/221 [03:16<23:29, 7.26s/it] 13%|████████████▍ | 28/221 [03:23<23:16, 7.23s/it] {'loss': 0.0312, 'grad_norm': 0.018204571679234505, 'learning_rate': 9.857450191464337e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 317.6, 'epoch': 0.13} | |
| 13%|████████████▍ | 28/221 [03:23<23:16, 7.23s/it] 13%|████████████▊ | 29/221 [03:30<23:06, 7.22s/it] {'loss': 0.0343, 'grad_norm': 0.02151501551270485, 'learning_rate': 9.839173742253334e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 471.66, 'epoch': 0.13} | |
| 13%|████████████▊ | 29/221 [03:30<23:06, 7.22s/it] 14%|█████████████▎ | 30/221 [03:37<22:51, 7.18s/it] {'loss': 0.0337, 'grad_norm': 0.021778756752610207, 'learning_rate': 9.819814303479267e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 391.67, 'epoch': 0.14} | |
| 14%|█████████████▎ | 30/221 [03:37<22:51, 7.18s/it] 14%|█████████████▋ | 31/221 [03:45<22:53, 7.23s/it] {'loss': 0.0264, 'grad_norm': 0.01405468862503767, 'learning_rate': 9.799376207714445e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 441.4, 'epoch': 0.14} | |
| 14%|█████████████▋ | 31/221 [03:45<22:53, 7.23s/it] 14%|██████████████▏ | 32/221 [03:52<22:39, 7.19s/it] {'loss': 0.0325, 'grad_norm': 0.0183633491396904, 'learning_rate': 9.777864028930705e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 449.02, 'epoch': 0.14} | |
| 14%|██████████████▏ | 32/221 [03:52<22:39, 7.19s/it] 15%|██████████████▋ | 33/221 [03:59<22:35, 7.21s/it] {'loss': 0.0342, 'grad_norm': 0.022185783833265305, 'learning_rate': 9.755282581475769e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 356.6, 'epoch': 0.15} | |
| 15%|██████████████▋ | 33/221 [03:59<22:35, 7.21s/it] 15%|███████████████ | 34/221 [04:06<22:21, 7.17s/it] {'loss': 0.0255, 'grad_norm': 0.015691177919507027, 'learning_rate': 9.731636918995821e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 461.92, 'epoch': 0.15} | |
| 15%|███████████████ | 34/221 [04:06<22:21, 7.17s/it] 16%|███████████████▌ | 35/221 [04:13<22:14, 7.17s/it] {'loss': 0.0314, 'grad_norm': 0.01962122693657875, 'learning_rate': 9.706932333304517e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 428.5, 'epoch': 0.16} | |
| 16%|███████████████▌ | 35/221 [04:13<22:14, 7.17s/it] 16%|███████████████▉ | 36/221 [04:20<22:05, 7.17s/it] {'loss': 0.0396, 'grad_norm': 0.01783391274511814, 'learning_rate': 9.681174353198687e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 393.65, 'epoch': 0.16} | |
| 16%|███████████████▉ | 36/221 [04:20<22:05, 7.17s/it] 17%|████████████████▍ | 37/221 [04:28<21:57, 7.16s/it] {'loss': 0.0351, 'grad_norm': 0.020520439371466637, 'learning_rate': 9.654368743221022e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 331.18, 'epoch': 0.17} | |
| 17%|████████████████▍ | 37/221 [04:28<21:57, 7.16s/it] 17%|████████████████▊ | 38/221 [04:35<21:47, 7.14s/it] {'loss': 0.0341, 'grad_norm': 0.019169267266988754, 'learning_rate': 9.626521502369984e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.89, 'tokens_per_second_per_gpu': 429.02, 'epoch': 0.17} | |
| 17%|████████████████▊ | 38/221 [04:35<21:47, 7.14s/it] 18%|█████████████████▎ | 39/221 [04:42<21:41, 7.15s/it] {'loss': 0.0351, 'grad_norm': 0.02138075977563858, 'learning_rate': 9.597638862757255e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 491.69, 'epoch': 0.18} | |
| 18%|█████████████████▎ | 39/221 [04:42<21:41, 7.15s/it] 18%|█████████████████▋ | 40/221 [04:49<21:37, 7.17s/it] {'loss': 0.0338, 'grad_norm': 0.0176653191447258, 'learning_rate': 9.567727288213005e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.26, 'epoch': 0.18} | |
| 18%|█████████████████▋ | 40/221 [04:49<21:37, 7.17s/it] 19%|██████████████████▏ | 41/221 [04:56<21:29, 7.17s/it] {'loss': 0.0316, 'grad_norm': 0.017243385314941406, 'learning_rate': 9.536793472839325e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.46, 'epoch': 0.19} | |
| 19%|██████████████████▏ | 41/221 [04:56<21:29, 7.17s/it] 19%|██████████████████▌ | 42/221 [05:03<21:18, 7.14s/it] {'loss': 0.0216, 'grad_norm': 0.0146207669749856, 'learning_rate': 9.504844339512095e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 363.65, 'epoch': 0.19} | |
| 19%|██████████████████▌ | 42/221 [05:03<21:18, 7.14s/it] 19%|███████████████████ | 43/221 [05:10<21:11, 7.14s/it] {'loss': 0.0335, 'grad_norm': 0.017516395077109337, 'learning_rate': 9.471887038331685e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.46, 'epoch': 0.19} | |
| 19%|███████████████████ | 43/221 [05:10<21:11, 7.14s/it] 20%|███████████████████▌ | 44/221 [05:18<21:05, 7.15s/it] {'loss': 0.0313, 'grad_norm': 0.01834929920732975, 'learning_rate': 9.437928945022771e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.41, 'epoch': 0.2} | |
| 20%|███████████████████▌ | 44/221 [05:18<21:05, 7.15s/it] 20%|███████████████████▉ | 45/221 [05:25<20:58, 7.15s/it] {'loss': 0.0312, 'grad_norm': 0.018410420045256615, 'learning_rate': 9.40297765928369e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 414.21, 'epoch': 0.2} | |
| 20%|███████████████████▉ | 45/221 [05:25<20:58, 7.15s/it] 21%|████████████████████▍ | 46/221 [05:32<20:43, 7.11s/it] {'loss': 0.0335, 'grad_norm': 0.043539997190237045, 'learning_rate': 9.367041003085649e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 319.79, 'epoch': 0.21} | |
| 21%|████████████████████▍ | 46/221 [05:32<20:43, 7.11s/it] 21%|████████████████████▊ | 47/221 [05:39<20:40, 7.13s/it] {'loss': 0.0307, 'grad_norm': 0.019783005118370056, 'learning_rate': 9.330127018922194e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 262.29, 'epoch': 0.21} | |
| 21%|████████████████████▊ | 47/221 [05:39<20:40, 7.13s/it] 22%|█████████████████████▎ | 48/221 [05:46<20:31, 7.12s/it] {'loss': 0.0308, 'grad_norm': 0.018382525071501732, 'learning_rate': 9.292243968009331e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.22} | |
| 22%|█████████████████████▎ | 48/221 [05:46<20:31, 7.12s/it] 22%|█████████████████████▋ | 49/221 [05:53<20:25, 7.13s/it] {'loss': 0.0338, 'grad_norm': 0.021564122289419174, 'learning_rate': 9.253400328436699e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 484.09, 'epoch': 0.22} | |
| 22%|█████████████████████▋ | 49/221 [05:53<20:25, 7.13s/it] 23%|██████████████████████▏ | 50/221 [06:00<20:16, 7.12s/it] {'loss': 0.0285, 'grad_norm': 0.016710789874196053, 'learning_rate': 9.213604793270196e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 445.99, 'epoch': 0.23} | |
| 23%|██████████████████████▏ | 50/221 [06:00<20:16, 7.12s/it] 23%|██████████████████████▌ | 51/221 [06:07<20:10, 7.12s/it] {'loss': 0.0293, 'grad_norm': 0.016314025968313217, 'learning_rate': 9.172866268606513e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.09, 'epoch': 0.23} | |
| 23%|██████████████████████▌ | 51/221 [06:07<20:10, 7.12s/it] 24%|███████████████████████ | 52/221 [06:14<20:02, 7.12s/it] {'loss': 0.0273, 'grad_norm': 0.01764376275241375, 'learning_rate': 9.131193871579975e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 558.97, 'epoch': 0.24} | |
| 24%|███████████████████████ | 52/221 [06:14<20:02, 7.12s/it] 24%|███████████████████████▌ | 53/221 [06:21<19:49, 7.08s/it] {'loss': 0.0245, 'grad_norm': 0.01896459050476551, 'learning_rate': 9.088596928322158e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.06, 'epoch': 0.24} | |
| 24%|███████████████████████▌ | 53/221 [06:21<19:49, 7.08s/it] 24%|███████████████████████▉ | 54/221 [06:29<19:48, 7.12s/it] {'loss': 0.0301, 'grad_norm': 0.01687958650290966, 'learning_rate': 9.045084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 296.5, 'epoch': 0.24} | |
| 24%|███████████████████████▉ | 54/221 [06:29<19:48, 7.12s/it] 25%|████████████████████████▍ | 55/221 [06:36<19:41, 7.12s/it] {'loss': 0.0341, 'grad_norm': 0.021479196846485138, 'learning_rate': 9.000667740056032e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.19, 'epoch': 0.25} | |
| 25%|████████████████████████▍ | 55/221 [06:36<19:41, 7.12s/it] 25%|████████████████████████▊ | 56/221 [06:43<19:39, 7.15s/it] {'loss': 0.0271, 'grad_norm': 0.016711527481675148, 'learning_rate': 8.955355173281708e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.2, 'epoch': 0.25} | |
| 25%|████████████████████████▊ | 56/221 [06:43<19:39, 7.15s/it] 26%|█████████████████████████▎ | 57/221 [06:50<19:37, 7.18s/it] {'loss': 0.0287, 'grad_norm': 0.01733219437301159, 'learning_rate': 8.90915741234015e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.43, 'epoch': 0.26} | |
| 26%|█████████████████████████▎ | 57/221 [06:50<19:37, 7.18s/it] 26%|█████████████████████████▋ | 58/221 [06:57<19:27, 7.16s/it] {'loss': 0.0254, 'grad_norm': 0.01601138710975647, 'learning_rate': 8.862084796122998e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 456.58, 'epoch': 0.26} | |
| 26%|█████████████████████████▋ | 58/221 [06:57<19:27, 7.16s/it] 27%|██████████████████████████▏ | 59/221 [07:04<19:16, 7.14s/it] {'loss': 0.0311, 'grad_norm': 0.018599703907966614, 'learning_rate': 8.814147859311332e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.27} | |
| 27%|██████████████████████████▏ | 59/221 [07:04<19:16, 7.14s/it] 27%|██████████████████████████▌ | 60/221 [07:12<19:09, 7.14s/it] {'loss': 0.0309, 'grad_norm': 0.01615062914788723, 'learning_rate': 8.765357330018056e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.15, 'epoch': 0.27} | |
| 27%|██████████████████████████▌ | 60/221 [07:12<19:09, 7.14s/it][2025-11-27 00:29:44,068] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-60 | |
| 28%|███████████████████████████ | 61/221 [07:31<28:50, 10.82s/it] {'loss': 0.0245, 'grad_norm': 0.01592225581407547, 'learning_rate': 8.715724127386972e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.76, 'epoch': 0.28} | |
| 28%|███████████████████████████ | 61/221 [07:31<28:50, 10.82s/it] 28%|███████████████████████████▍ | 62/221 [07:38<25:36, 9.66s/it] {'loss': 0.0295, 'grad_norm': 0.02008405141532421, 'learning_rate': 8.665259359149132e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 447.91, 'epoch': 0.28} | |
| 28%|███████████████████████████▍ | 62/221 [07:38<25:36, 9.66s/it] 29%|███████████████████████████▉ | 63/221 [07:45<23:25, 8.90s/it] {'loss': 0.0366, 'grad_norm': 0.019492069259285927, 'learning_rate': 8.613974319136958e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 405.86, 'epoch': 0.29} | |
| 29%|███████████████████████████▉ | 63/221 [07:45<23:25, 8.90s/it] 29%|████████████████████████████▍ | 64/221 [07:52<21:51, 8.36s/it] {'loss': 0.0284, 'grad_norm': 0.02178225666284561, 'learning_rate': 8.561880484756725e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.25, 'epoch': 0.29} | |
| 29%|████████████████████████████▍ | 64/221 [07:52<21:51, 8.36s/it] 29%|████████████████████████████▊ | 65/221 [07:59<20:46, 7.99s/it] {'loss': 0.0297, 'grad_norm': 0.019038653001189232, 'learning_rate': 8.508989514419958e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 345.76, 'epoch': 0.29} | |
| 29%|████████████████████████████▊ | 65/221 [07:59<20:46, 7.99s/it] 30%|█████████████████████████████▎ | 66/221 [08:07<20:04, 7.77s/it] {'loss': 0.0293, 'grad_norm': 0.01684654876589775, 'learning_rate': 8.455313244934324e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.88, 'epoch': 0.3} | |
| 30%|█████████████████████████████▎ | 66/221 [08:07<20:04, 7.77s/it] 30%|█████████████████████████████▋ | 67/221 [08:14<19:25, 7.57s/it] {'loss': 0.0239, 'grad_norm': 0.01636500470340252, 'learning_rate': 8.400863688854597e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.77, 'epoch': 0.3} | |
| 30%|█████████████████████████████▋ | 67/221 [08:14<19:25, 7.57s/it] 31%|██████████████████████████████▏ | 68/221 [08:21<18:56, 7.43s/it] {'loss': 0.0263, 'grad_norm': 0.020848819985985756, 'learning_rate': 8.345653031794292e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 365.36, 'epoch': 0.31} | |
| 31%|██████████████████████████████▏ | 68/221 [08:21<18:56, 7.43s/it] 31%|██████████████████████████████▌ | 69/221 [08:28<18:33, 7.33s/it] {'loss': 0.0355, 'grad_norm': 0.02269025892019272, 'learning_rate': 8.289693629698564e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.76, 'epoch': 0.31} | |
| 31%|██████████████████████████████▌ | 69/221 [08:28<18:33, 7.33s/it] 32%|███████████████████████████████ | 70/221 [08:35<18:16, 7.26s/it] {'loss': 0.0284, 'grad_norm': 0.01883563958108425, 'learning_rate': 8.232998006078997e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.65, 'epoch': 0.32} | |
| 32%|███████████████████████████████ | 70/221 [08:35<18:16, 7.26s/it] 32%|███████████████████████████████▍ | 71/221 [08:42<17:58, 7.19s/it] {'loss': 0.0261, 'grad_norm': 0.017690833657979965, 'learning_rate': 8.175578849210895e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 441.77, 'epoch': 0.32} | |
| 32%|███████████████████████████████▍ | 71/221 [08:42<17:58, 7.19s/it] 33%|███████████████████████████████▉ | 72/221 [08:49<17:49, 7.17s/it] {'loss': 0.0282, 'grad_norm': 0.018213583156466484, 'learning_rate': 8.117449009293668e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 413.18, 'epoch': 0.33} | |
| 33%|███████████████████████████████▉ | 72/221 [08:49<17:49, 7.17s/it] 33%|████████████████████████████████▎ | 73/221 [08:56<17:36, 7.14s/it] {'loss': 0.0309, 'grad_norm': 0.019336581230163574, 'learning_rate': 8.058621495575032e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.23, 'epoch': 0.33} | |
| 33%|████████████████████████████████▎ | 73/221 [08:56<17:36, 7.14s/it] 33%|████████████████████████████████▊ | 74/221 [09:03<17:25, 7.11s/it] {'loss': 0.0312, 'grad_norm': 0.019243910908699036, 'learning_rate': 7.999109473439569e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.43, 'epoch': 0.33} | |
| 33%|████████████████████████████████▊ | 74/221 [09:03<17:25, 7.11s/it] 34%|█████████████████████████████████▎ | 75/221 [09:10<17:17, 7.11s/it] {'loss': 0.0317, 'grad_norm': 0.01973150111734867, 'learning_rate': 7.938926261462366e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.63, 'epoch': 0.34} | |
| 34%|█████████████████████████████████▎ | 75/221 [09:10<17:17, 7.11s/it] 34%|█████████████████████████████████▋ | 76/221 [09:18<17:15, 7.14s/it] {'loss': 0.0293, 'grad_norm': 0.02182990498840809, 'learning_rate': 7.878085328428369e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.11, 'epoch': 0.34} | |
| 34%|█████████████████████████████████▋ | 76/221 [09:18<17:15, 7.14s/it] 35%|██████████████████████████████████▏ | 77/221 [09:25<17:06, 7.13s/it] {'loss': 0.0287, 'grad_norm': 0.018669869750738144, 'learning_rate': 7.81660029031811e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 408.44, 'epoch': 0.35} | |
| 35%|██████████████████████████████████▏ | 77/221 [09:25<17:06, 7.13s/it] 35%|██████████████████████████████████▌ | 78/221 [09:32<16:59, 7.13s/it] {'loss': 0.0315, 'grad_norm': 0.016969047486782074, 'learning_rate': 7.754484907260513e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.64, 'epoch': 0.35} | |
| 35%|██████████████████████████████████▌ | 78/221 [09:32<16:59, 7.13s/it] 36%|███████████████████████████████████ | 79/221 [09:39<16:48, 7.10s/it] {'loss': 0.0278, 'grad_norm': 0.019713636487722397, 'learning_rate': 7.691753080453412e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 397.76, 'epoch': 0.36} | |
| 36%|███████████████████████████████████ | 79/221 [09:39<16:48, 7.10s/it] 36%|███████████████████████████████████▍ | 80/221 [09:46<16:45, 7.13s/it] {'loss': 0.0255, 'grad_norm': 0.017600620165467262, 'learning_rate': 7.628418849052523e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.33, 'epoch': 0.36} | |
| 36%|███████████████████████████████████▍ | 80/221 [09:46<16:45, 7.13s/it] 37%|███████████████████████████████████▉ | 81/221 [09:53<16:36, 7.12s/it] {'loss': 0.0234, 'grad_norm': 0.018615400418639183, 'learning_rate': 7.564496387029532e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 352.61, 'epoch': 0.37} | |
| 37%|███████████████████████████████████▉ | 81/221 [09:53<16:36, 7.12s/it] 37%|████████████████████████████████████▎ | 82/221 [10:00<16:27, 7.11s/it] {'loss': 0.0234, 'grad_norm': 0.023312179371714592, 'learning_rate': 7.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 499.51, 'epoch': 0.37} | |
| 37%|████████████████████████████████████▎ | 82/221 [10:00<16:27, 7.11s/it] 38%|████████████████████████████████████▊ | 83/221 [10:07<16:23, 7.13s/it] {'loss': 0.0321, 'grad_norm': 0.01922520436346531, 'learning_rate': 7.434944122021836e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 419.08, 'epoch': 0.38} | |
| 38%|████████████████████████████████████▊ | 83/221 [10:07<16:23, 7.13s/it] 38%|█████████████████████████████████████▏ | 84/221 [10:14<16:11, 7.09s/it] {'loss': 0.0254, 'grad_norm': 0.019153179600834846, 'learning_rate': 7.369343312364993e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.78, 'epoch': 0.38} | |
| 38%|█████████████████████████████████████▏ | 84/221 [10:14<16:11, 7.09s/it] 38%|█████████████████████████████████████▋ | 85/221 [10:22<16:12, 7.15s/it] {'loss': 0.0343, 'grad_norm': 0.020285822451114655, 'learning_rate': 7.303212252253162e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 371.99, 'epoch': 0.38} | |
| 38%|█████████████████████████████████████▋ | 85/221 [10:22<16:12, 7.15s/it] 39%|██████████████████████████████████████▏ | 86/221 [10:29<15:59, 7.11s/it] {'loss': 0.0249, 'grad_norm': 0.016675548627972603, 'learning_rate': 7.236565741578163e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.7, 'epoch': 0.39} | |
| 39%|██████████████████████████████████████▏ | 86/221 [10:29<15:59, 7.11s/it] 39%|██████████████████████████████████████▌ | 87/221 [10:36<15:59, 7.16s/it] {'loss': 0.0289, 'grad_norm': 0.015159820206463337, 'learning_rate': 7.169418695587791e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 344.64, 'epoch': 0.39} | |
| 39%|██████████████████████████████████████▌ | 87/221 [10:36<15:59, 7.16s/it] 40%|███████████████████████████████████████ | 88/221 [10:43<15:52, 7.16s/it] {'loss': 0.0276, 'grad_norm': 0.018055099993944168, 'learning_rate': 7.101786141547828e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 375.32, 'epoch': 0.4} | |
| 40%|███████████████████████████████████████ | 88/221 [10:43<15:52, 7.16s/it] 40%|███████████████████████████████████████▍ | 89/221 [10:50<15:42, 7.14s/it] {'loss': 0.0277, 'grad_norm': 0.01955697126686573, 'learning_rate': 7.033683215379002e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.83, 'epoch': 0.4} | |
| 40%|███████████████████████████████████████▍ | 89/221 [10:50<15:42, 7.14s/it] 41%|███████████████████████████████████████▉ | 90/221 [10:57<15:36, 7.15s/it] {'loss': 0.0277, 'grad_norm': 0.01860162802040577, 'learning_rate': 6.965125158269619e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.84, 'epoch': 0.41} | |
| 41%|███████████████████████████████████████▉ | 90/221 [10:57<15:36, 7.15s/it] 41%|████████████████████████████████████████▎ | 91/221 [11:04<15:25, 7.12s/it] {'loss': 0.0322, 'grad_norm': 0.02057529240846634, 'learning_rate': 6.896127313264643e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.85, 'epoch': 0.41} | |
| 41%|████████████████████████████████████████▎ | 91/221 [11:04<15:25, 7.12s/it] 42%|████████████████████████████████████████▊ | 92/221 [11:11<15:15, 7.09s/it] {'loss': 0.0228, 'grad_norm': 0.017251698300242424, 'learning_rate': 6.826705121831976e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 483.38, 'epoch': 0.42} | |
| 42%|████████████████████████████████████████▊ | 92/221 [11:11<15:15, 7.09s/it] 42%|█████████████████████████████████████████▏ | 93/221 [11:19<15:21, 7.20s/it] {'loss': 0.028, 'grad_norm': 0.017092842608690262, 'learning_rate': 6.756874120406714e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.82, 'epoch': 0.42} | |
| 42%|█████████████████████████████████████████▏ | 93/221 [11:19<15:21, 7.20s/it] 43%|█████████████████████████████████████████▋ | 94/221 [11:26<15:15, 7.21s/it] {'loss': 0.0272, 'grad_norm': 0.01863975077867508, 'learning_rate': 6.686649936914152e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 455.96, 'epoch': 0.43} | |
| 43%|█████████████████████████████████████████▋ | 94/221 [11:26<15:15, 7.21s/it] 43%|██████████████████████████████████████████▏ | 95/221 [11:33<15:06, 7.19s/it] {'loss': 0.0256, 'grad_norm': 0.019126810133457184, 'learning_rate': 6.616048287272301e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.29, 'epoch': 0.43} | |
| 43%|██████████████████████████████████████████▏ | 95/221 [11:33<15:06, 7.19s/it] 43%|██████████████████████████████████████████▌ | 96/221 [11:40<14:59, 7.20s/it] {'loss': 0.0293, 'grad_norm': 0.019856387749314308, 'learning_rate': 6.545084971874738e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 330.99, 'epoch': 0.43} | |
| 43%|██████████████████████████████████████████▌ | 96/221 [11:40<14:59, 7.20s/it] 44%|███████████████████████████████████████████ | 97/221 [11:48<14:48, 7.17s/it] {'loss': 0.0298, 'grad_norm': 0.020938578993082047, 'learning_rate': 6.473775872054521e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 385.5, 'epoch': 0.44} | |
| 44%|███████████████████████████████████████████ | 97/221 [11:48<14:48, 7.17s/it] 44%|███████████████████████████████████████████▍ | 98/221 [11:55<14:41, 7.17s/it] {'loss': 0.0213, 'grad_norm': 0.01743321865797043, 'learning_rate': 6.402136946530014e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.25, 'epoch': 0.44} | |
| 44%|███████████████████████████████████████████▍ | 98/221 [11:55<14:41, 7.17s/it] 45%|███████████████████████████████████████████▉ | 99/221 [12:02<14:37, 7.19s/it] {'loss': 0.0289, 'grad_norm': 0.03026910126209259, 'learning_rate': 6.330184227833376e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.35, 'epoch': 0.45} | |
| 45%|███████████████████████████████████████████▉ | 99/221 [12:02<14:37, 7.19s/it] 45%|███████████████████████████████████████████▉ | 100/221 [12:09<14:22, 7.13s/it] {'loss': 0.0255, 'grad_norm': 0.021303489804267883, 'learning_rate': 6.257933818722543e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 508.68, 'epoch': 0.45} | |
| 45%|███████████████████████████████████████████▉ | 100/221 [12:09<14:22, 7.13s/it] 46%|████████████████████████████████████████████▎ | 101/221 [12:16<14:14, 7.12s/it] {'loss': 0.0302, 'grad_norm': 0.018962478265166283, 'learning_rate': 6.185401888577488e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.8, 'epoch': 0.46} | |
| 46%|████████████████████████████████████████████▎ | 101/221 [12:16<14:14, 7.12s/it] 46%|████████████████████████████████████████████▊ | 102/221 [12:23<14:09, 7.14s/it] {'loss': 0.0233, 'grad_norm': 0.01941424049437046, 'learning_rate': 6.112604669781572e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 468.02, 'epoch': 0.46} | |
| 46%|████████████████████████████████████████████▊ | 102/221 [12:23<14:09, 7.14s/it] 47%|█████████████████████████████████████████████▏ | 103/221 [12:30<14:02, 7.14s/it] {'loss': 0.0295, 'grad_norm': 0.019837241619825363, 'learning_rate': 6.0395584540887963e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 429.34, 'epoch': 0.47} | |
| 47%|█████████████████████████████████████████████▏ | 103/221 [12:30<14:02, 7.14s/it] 47%|█████████████████████████████████████████████▋ | 104/221 [12:38<13:54, 7.13s/it] {'loss': 0.0259, 'grad_norm': 0.018758604303002357, 'learning_rate': 5.9662795889777666e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 337.38, 'epoch': 0.47} | |
| 47%|█████████████████████████████████████████████▋ | 104/221 [12:38<13:54, 7.13s/it] 48%|██████████████████████████████████████████████ | 105/221 [12:45<13:43, 7.10s/it] {'loss': 0.0274, 'grad_norm': 0.01769891194999218, 'learning_rate': 5.8927844739931834e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.82, 'epoch': 0.48} | |
| 48%|██████████████████████████████████████████████ | 105/221 [12:45<13:43, 7.10s/it] 48%|██████████████████████████████████████████████▌ | 106/221 [12:52<13:41, 7.14s/it] {'loss': 0.0265, 'grad_norm': 0.017575478181242943, 'learning_rate': 5.819089557075689e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 473.38, 'epoch': 0.48} | |
| 48%|██████████████████████████████████████████████▌ | 106/221 [12:52<13:41, 7.14s/it] 48%|██████████████████████████████████████████████▉ | 107/221 [12:59<13:30, 7.11s/it] {'loss': 0.0261, 'grad_norm': 0.017795337364077568, 'learning_rate': 5.745211330880872e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.08, 'epoch': 0.48} | |
| 48%|██████████████████████████████████████████████▉ | 107/221 [12:59<13:30, 7.11s/it] 49%|███████████████████████████████████████████████▍ | 108/221 [13:06<13:21, 7.09s/it] {'loss': 0.0312, 'grad_norm': 0.021093547344207764, 'learning_rate': 5.6711663290882776e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.72, 'epoch': 0.49} | |
| 49%|███████████████████████████████████████████████▍ | 108/221 [13:06<13:21, 7.09s/it] 49%|███████████████████████████████████████████████▊ | 109/221 [13:13<13:10, 7.06s/it] {'loss': 0.0238, 'grad_norm': 0.022809553891420364, 'learning_rate': 5.596971122701221e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 504.0, 'epoch': 0.49} | |
| 49%|███████████████████████████████████████████████▊ | 109/221 [13:13<13:10, 7.06s/it] 50%|████████████████████████████████████████████████▎ | 110/221 [13:20<13:05, 7.08s/it] {'loss': 0.0267, 'grad_norm': 0.018646899610757828, 'learning_rate': 5.522642316338268e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.5} | |
| 50%|████████████████████████████████████████████████▎ | 110/221 [13:20<13:05, 7.08s/it] 50%|████████████████████████████████████████████████▋ | 111/221 [13:27<13:01, 7.10s/it] {'loss': 0.0253, 'grad_norm': 0.0172793660312891, 'learning_rate': 5.448196544517168e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.68, 'epoch': 0.5} | |
| 50%|████████████████████████████████████████████████▋ | 111/221 [13:27<13:01, 7.10s/it] 51%|█████████████████████████████████████████████████▏ | 112/221 [13:34<12:57, 7.14s/it] {'loss': 0.0335, 'grad_norm': 0.019996505230665207, 'learning_rate': 5.373650467932122e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 513.18, 'epoch': 0.51} | |
| 51%|█████████████████████████████████████████████████▏ | 112/221 [13:34<12:57, 7.14s/it] 51%|█████████████████████████████████████████████████▌ | 113/221 [13:42<12:51, 7.14s/it] {'loss': 0.0333, 'grad_norm': 0.017304031178355217, 'learning_rate': 5.299020769725172e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.94, 'epoch': 0.51} | |
| 51%|█████████████████████████████████████████████████▌ | 113/221 [13:42<12:51, 7.14s/it] 52%|██████████████████████████████████████████████████ | 114/221 [13:49<12:43, 7.13s/it] {'loss': 0.027, 'grad_norm': 0.01827949658036232, 'learning_rate': 5.2243241517525754e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 286.28, 'epoch': 0.52} | |
| 52%|██████████████████████████████████████████████████ | 114/221 [13:49<12:43, 7.13s/it] 52%|██████████████████████████████████████████████████▍ | 115/221 [13:56<12:34, 7.12s/it] {'loss': 0.0252, 'grad_norm': 0.018098153173923492, 'learning_rate': 5.149577330846993e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.73, 'epoch': 0.52} | |
| 52%|██████████████████████████████████████████████████▍ | 115/221 [13:56<12:34, 7.12s/it] 52%|██████████████████████████████████████████████████▉ | 116/221 [14:03<12:28, 7.13s/it] {'loss': 0.0229, 'grad_norm': 0.015578909777104855, 'learning_rate': 5.074797035076319e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.63, 'epoch': 0.52} | |
| 52%|██████████████████████████████████████████████████▉ | 116/221 [14:03<12:28, 7.13s/it] 53%|███████████████████████████████████████████████████▎ | 117/221 [14:10<12:22, 7.14s/it] {'loss': 0.0283, 'grad_norm': 0.01797802932560444, 'learning_rate': 5e-05, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.61, 'epoch': 0.53} | |
| 53%|███████████████████████████████████████████████████▎ | 117/221 [14:10<12:22, 7.14s/it] 53%|███████████████████████████████████████████████████▊ | 118/221 [14:17<12:14, 7.13s/it] {'loss': 0.0259, 'grad_norm': 0.018971417099237442, 'learning_rate': 4.925202964923683e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.12, 'epoch': 0.53} | |
| 53%|███████████████████████████████████████████████████▊ | 118/221 [14:17<12:14, 7.13s/it] 54%|████████████████████████████████████████████████████▏ | 119/221 [14:24<12:09, 7.15s/it] {'loss': 0.0265, 'grad_norm': 0.019693924114108086, 'learning_rate': 4.850422669153009e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 549.79, 'epoch': 0.54} | |
| 54%|████████████████████████████████████████████████████▏ | 119/221 [14:24<12:09, 7.15s/it] 54%|████████████████████████████████████████████████████▋ | 120/221 [14:31<12:02, 7.15s/it] {'loss': 0.0277, 'grad_norm': 0.020947441458702087, 'learning_rate': 4.775675848247427e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.17, 'epoch': 0.54} | |
| 54%|████████████████████████████████████████████████████▋ | 120/221 [14:31<12:02, 7.15s/it][2025-11-27 00:37:03,937] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-120 | |
| 55%|█████████████████████████████████████████████████████ | 121/221 [14:51<17:59, 10.80s/it] {'loss': 0.0249, 'grad_norm': 0.01684478297829628, 'learning_rate': 4.700979230274829e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 399.17, 'epoch': 0.55} | |
| 55%|█████████████████████████████████████████████████████ | 121/221 [14:51<17:59, 10.80s/it] 55%|█████████████████████████████████████████████████████▌ | 122/221 [14:58<16:01, 9.72s/it] {'loss': 0.0289, 'grad_norm': 0.019412320107221603, 'learning_rate': 4.626349532067879e-05, 'memory/max_active (GiB)': 49.08, 'memory/max_allocated (GiB)': 49.08, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 450.57, 'epoch': 0.55} | |
| 55%|█████████████████████████████████████████████████████▌ | 122/221 [14:58<16:01, 9.72s/it] 56%|█████████████████████████████████████████████████████▉ | 123/221 [15:05<14:31, 8.89s/it] {'loss': 0.0246, 'grad_norm': 0.018697615712881088, 'learning_rate': 4.551803455482833e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 310.05, 'epoch': 0.56} | |
| 56%|█████████████████████████████████████████████████████▉ | 123/221 [15:05<14:31, 8.89s/it] 56%|██████████████████████████████████████████████████████▍ | 124/221 [15:12<13:31, 8.36s/it] {'loss': 0.0245, 'grad_norm': 0.01853892020881176, 'learning_rate': 4.477357683661734e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 448.58, 'epoch': 0.56} | |
| 56%|██████████████████████████████████████████████████████▍ | 124/221 [15:12<13:31, 8.36s/it] 57%|██████████████████████████████████████████████████████▊ | 125/221 [15:19<12:44, 7.96s/it] {'loss': 0.0257, 'grad_norm': 0.017834430560469627, 'learning_rate': 4.403028877298779e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 379.27, 'epoch': 0.57} | |
| 57%|██████████████████████████████████████████████████████▊ | 125/221 [15:19<12:44, 7.96s/it] 57%|███████████████████████████████████████████████████████▎ | 126/221 [15:26<12:16, 7.75s/it] {'loss': 0.027, 'grad_norm': 0.018574297428131104, 'learning_rate': 4.328833670911724e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.42, 'epoch': 0.57} | |
| 57%|███████████████████████████████████████████████████████▎ | 126/221 [15:26<12:16, 7.75s/it] 57%|███████████████████████████████████████████████████████▋ | 127/221 [15:34<11:51, 7.57s/it] {'loss': 0.0333, 'grad_norm': 0.019024794921278954, 'learning_rate': 4.254788669119127e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 458.35, 'epoch': 0.57} | |
| 57%|███████████████████████████████████████████████████████▋ | 127/221 [15:34<11:51, 7.57s/it] 58%|████████████████████████████████████████████████████████▏ | 128/221 [15:41<11:30, 7.43s/it] {'loss': 0.0247, 'grad_norm': 0.018406303599476814, 'learning_rate': 4.180910442924312e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.39, 'epoch': 0.58} | |
| 58%|████████████████████████████████████████████████████████▏ | 128/221 [15:41<11:30, 7.43s/it] 58%|████████████████████████████████████████████████████████▌ | 129/221 [15:48<11:15, 7.34s/it] {'loss': 0.0217, 'grad_norm': 0.016275746747851372, 'learning_rate': 4.107215526006817e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 485.26, 'epoch': 0.58} | |
| 58%|████████████████████████████████████████████████████████▌ | 129/221 [15:48<11:15, 7.34s/it] 59%|█████████████████████████████████████████████████████████ | 130/221 [15:55<10:59, 7.25s/it] {'loss': 0.0284, 'grad_norm': 0.018617160618305206, 'learning_rate': 4.0337204110222346e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 409.41, 'epoch': 0.59} | |
| 59%|█████████████████████████████████████████████████████████ | 130/221 [15:55<10:59, 7.25s/it] 59%|█████████████████████████████████████████████████████████▍ | 131/221 [16:02<10:48, 7.20s/it] {'loss': 0.0216, 'grad_norm': 0.01993851736187935, 'learning_rate': 3.960441545911204e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 391.14, 'epoch': 0.59} | |
| 59%|█████████████████████████████████████████████████████████▍ | 131/221 [16:02<10:48, 7.20s/it] 60%|█████████████████████████████████████████████████████████▉ | 132/221 [16:09<10:38, 7.17s/it] {'loss': 0.025, 'grad_norm': 0.017185868695378304, 'learning_rate': 3.887395330218429e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 476.36, 'epoch': 0.6} | |
| 60%|█████████████████████████████████████████████████████████▉ | 132/221 [16:09<10:38, 7.17s/it] 60%|██████████████████████████████████████████████████████████▍ | 133/221 [16:16<10:25, 7.11s/it] {'loss': 0.0264, 'grad_norm': 0.018661336973309517, 'learning_rate': 3.814598111422513e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 417.55, 'epoch': 0.6} | |
| 60%|██████████████████████████████████████████████████████████▍ | 133/221 [16:16<10:25, 7.11s/it] 61%|██████████████████████████████████████████████████████████▊ | 134/221 [16:23<10:18, 7.10s/it] {'loss': 0.0274, 'grad_norm': 0.022303791716694832, 'learning_rate': 3.742066181277458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.2, 'epoch': 0.61} | |
| 61%|██████████████████████████████████████████████████████████▊ | 134/221 [16:23<10:18, 7.10s/it] 61%|███████████████████████████████████████████████████████████▎ | 135/221 [16:30<10:15, 7.16s/it] {'loss': 0.0217, 'grad_norm': 0.0167496707290411, 'learning_rate': 3.6698157721666246e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 373.07, 'epoch': 0.61} | |
| 61%|███████████████████████████████████████████████████████████▎ | 135/221 [16:30<10:15, 7.16s/it] 62%|███████████████████████████████████████████████████████████▋ | 136/221 [16:37<10:08, 7.16s/it] {'loss': 0.0219, 'grad_norm': 0.016045598313212395, 'learning_rate': 3.597863053469987e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 463.19, 'epoch': 0.62} | |
| 62%|███████████████████████████████████████████████████████████▋ | 136/221 [16:37<10:08, 7.16s/it] 62%|████████████████████████████████████████████████████████████▏ | 137/221 [16:45<09:59, 7.13s/it] {'loss': 0.027, 'grad_norm': 0.017510127276182175, 'learning_rate': 3.5262241279454785e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 440.64, 'epoch': 0.62} | |
| 62%|████████████████████████████████████████████████████████████▏ | 137/221 [16:45<09:59, 7.13s/it] 62%|████████████████████████████████████████████████████████████▌ | 138/221 [16:52<09:54, 7.16s/it] {'loss': 0.0308, 'grad_norm': 0.02389226295053959, 'learning_rate': 3.4549150281252636e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.1, 'epoch': 0.62} | |
| 62%|████████████████████████████████████████████████████████████▌ | 138/221 [16:52<09:54, 7.16s/it] 63%|█████████████████████████████████████████████████████████████ | 139/221 [16:59<09:50, 7.20s/it] {'loss': 0.0275, 'grad_norm': 0.01793692260980606, 'learning_rate': 3.383951712727701e-05, 'memory/max_active (GiB)': 49.04, 'memory/max_allocated (GiB)': 49.04, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.85, 'epoch': 0.63} | |
| 63%|█████████████████████████████████████████████████████████████ | 139/221 [16:59<09:50, 7.20s/it] 63%|█████████████████████████████████████████████████████████████▍ | 140/221 [17:06<09:39, 7.16s/it] {'loss': 0.0294, 'grad_norm': 0.02539198286831379, 'learning_rate': 3.313350063085851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 462.2, 'epoch': 0.63} | |
| 63%|█████████████████████████████████████████████████████████████▍ | 140/221 [17:06<09:39, 7.16s/it] 64%|█████████████████████████████████████████████████████████████▉ | 141/221 [17:13<09:30, 7.13s/it] {'loss': 0.0273, 'grad_norm': 0.019213683903217316, 'learning_rate': 3.243125879593286e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.57, 'epoch': 0.64} | |
| 64%|█████████████████████████████████████████████████████████████▉ | 141/221 [17:13<09:30, 7.13s/it] 64%|██████████████████████████████████████████████████████████████▎ | 142/221 [17:20<09:21, 7.11s/it] {'loss': 0.0357, 'grad_norm': 0.022743066772818565, 'learning_rate': 3.173294878168025e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.85, 'epoch': 0.64} | |
| 64%|██████████████████████████████████████████████████████████████▎ | 142/221 [17:20<09:21, 7.11s/it] 65%|██████████████████████████████████████████████████████████████▊ | 143/221 [17:27<09:17, 7.14s/it] {'loss': 0.0258, 'grad_norm': 0.018310556188225746, 'learning_rate': 3.103872686735358e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 307.54, 'epoch': 0.65} | |
| 65%|██████████████████████████████████████████████████████████████▊ | 143/221 [17:27<09:17, 7.14s/it] 65%|███████████████████████████████████████████████████████████████▏ | 144/221 [17:35<09:08, 7.12s/it] {'loss': 0.0265, 'grad_norm': 0.01981915533542633, 'learning_rate': 3.0348748417303823e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 489.51, 'epoch': 0.65} | |
| 65%|███████████████████████████████████████████████████████████████▏ | 144/221 [17:35<09:08, 7.12s/it] 66%|███████████████████████████████████████████████████████████████▋ | 145/221 [17:42<08:58, 7.08s/it] {'loss': 0.032, 'grad_norm': 0.01881423592567444, 'learning_rate': 2.9663167846209998e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.97, 'epoch': 0.66} | |
| 66%|███████████████████████████████████████████████████████████████▋ | 145/221 [17:42<08:58, 7.08s/it] 66%|████████████████████████████████████████████████████████████████ | 146/221 [17:49<08:52, 7.10s/it] {'loss': 0.029, 'grad_norm': 0.017727544531226158, 'learning_rate': 2.8982138584521735e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 475.56, 'epoch': 0.66} | |
| 66%|████████████████████████████████████████████████████████████████ | 146/221 [17:49<08:52, 7.10s/it] 67%|████████████████████████████████████████████████████████████████▌ | 147/221 [17:56<08:45, 7.10s/it] {'loss': 0.0292, 'grad_norm': 0.01951543428003788, 'learning_rate': 2.8305813044122097e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 376.39, 'epoch': 0.67} | |
| 67%|████████████████████████████████████████████████████████████████▌ | 147/221 [17:56<08:45, 7.10s/it] 67%|████████████████████████████████████████████████████████████████▉ | 148/221 [18:03<08:37, 7.09s/it] {'loss': 0.0272, 'grad_norm': 0.018047522753477097, 'learning_rate': 2.7634342584218365e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 514.29, 'epoch': 0.67} | |
| 67%|████████████████████████████████████████████████████████████████▉ | 148/221 [18:03<08:37, 7.09s/it] 67%|█████████████████████████████████████████████████████████████████▍ | 149/221 [18:10<08:35, 7.16s/it] {'loss': 0.0295, 'grad_norm': 0.020815616473555565, 'learning_rate': 2.6967877477468397e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.84, 'epoch': 0.67} | |
| 67%|█████████████████████████████████████████████████████████████████▍ | 149/221 [18:10<08:35, 7.16s/it] 68%|█████████████████████████████████████████████████████████████████▊ | 150/221 [18:17<08:26, 7.13s/it] {'loss': 0.0303, 'grad_norm': 0.02059975638985634, 'learning_rate': 2.630656687635007e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 424.18, 'epoch': 0.68} | |
| 68%|█████████████████████████████████████████████████████████████████▊ | 150/221 [18:17<08:26, 7.13s/it] 68%|██████████████████████████████████████████████████████████████████▎ | 151/221 [18:24<08:19, 7.13s/it] {'loss': 0.0297, 'grad_norm': 0.019998600706458092, 'learning_rate': 2.5650558779781635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 449.86, 'epoch': 0.68} | |
| 68%|██████████████████████████████████████████████████████████████████▎ | 151/221 [18:24<08:19, 7.13s/it] 69%|██████████████████████████████████████████████████████████████████▋ | 152/221 [18:32<08:15, 7.19s/it] {'loss': 0.033, 'grad_norm': 0.022024452686309814, 'learning_rate': 2.500000000000001e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 420.49, 'epoch': 0.69} | |
| 69%|██████████████████████████████████████████████████████████████████▋ | 152/221 [18:32<08:15, 7.19s/it] 69%|███████████████████████████████████████████████████████████████████▏ | 153/221 [18:39<08:13, 7.26s/it] {'loss': 0.0278, 'grad_norm': 0.019334938377141953, 'learning_rate': 2.43550361297047e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 362.99, 'epoch': 0.69} | |
| 69%|███████████████████████████████████████████████████████████████████▏ | 153/221 [18:39<08:13, 7.26s/it] 70%|███████████████████████████████████████████████████████████████████▌ | 154/221 [18:46<08:04, 7.23s/it] {'loss': 0.0294, 'grad_norm': 0.02994287945330143, 'learning_rate': 2.371581150947476e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.31, 'epoch': 0.7} | |
| 70%|███████████████████████████████████████████████████████████████████▌ | 154/221 [18:46<08:04, 7.23s/it] 70%|████████████████████████████████████████████████████████████████████ | 155/221 [18:53<07:54, 7.19s/it] {'loss': 0.0228, 'grad_norm': 0.021970966830849648, 'learning_rate': 2.3082469195465893e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.03, 'epoch': 0.7} | |
| 70%|████████████████████████████████████████████████████████████████████ | 155/221 [18:53<07:54, 7.19s/it] 71%|████████████████████████████████████████████████████████████████████▍ | 156/221 [19:00<07:44, 7.15s/it] {'loss': 0.0225, 'grad_norm': 0.017728326842188835, 'learning_rate': 2.245515092739488e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 474.23, 'epoch': 0.71} | |
| 71%|████████████████████████████████████████████████████████████████████▍ | 156/221 [19:00<07:44, 7.15s/it] 71%|████████████████████████████████████████████████████████████████████▉ | 157/221 [19:08<07:38, 7.17s/it] {'loss': 0.0206, 'grad_norm': 0.016926869750022888, 'learning_rate': 2.1833997096818898e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.82, 'epoch': 0.71} | |
| 71%|████████████████████████████████████████████████████████████████████▉ | 157/221 [19:08<07:38, 7.17s/it] 71%|█████████████████████████████████████████████████████████████████████▎ | 158/221 [19:15<07:29, 7.14s/it] {'loss': 0.0281, 'grad_norm': 0.029541337862610817, 'learning_rate': 2.1219146715716332e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 381.89, 'epoch': 0.71} | |
| 71%|█████████████████████████████████████████████████████████████████████▎ | 158/221 [19:15<07:29, 7.14s/it] 72%|█████████████████████████████████████████████████████████████████████▊ | 159/221 [19:22<07:23, 7.15s/it] {'loss': 0.0234, 'grad_norm': 0.01689094677567482, 'learning_rate': 2.061073738537635e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 387.99, 'epoch': 0.72} | |
| 72%|█████████████████████████████████████████████████████████████████████▊ | 159/221 [19:22<07:23, 7.15s/it] 72%|██████████████████████████████████████████████████████████████████████▏ | 160/221 [19:29<07:14, 7.13s/it] {'loss': 0.033, 'grad_norm': 0.01850043050944805, 'learning_rate': 2.0008905265604316e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.06, 'epoch': 0.72} | |
| 72%|██████████████████████████████████████████████████████████████████████▏ | 160/221 [19:29<07:14, 7.13s/it] 73%|██████████████████████████████████████████████████████████████████████▋ | 161/221 [19:36<07:07, 7.12s/it] {'loss': 0.033, 'grad_norm': 0.020465202629566193, 'learning_rate': 1.9413785044249678e-05, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 422.54, 'epoch': 0.73} | |
| 73%|██████████████████████████████████████████████████████████████████████▋ | 161/221 [19:36<07:07, 7.12s/it] 73%|███████████████████████████████████████████████████████████████████████ | 162/221 [19:43<07:00, 7.12s/it] {'loss': 0.0302, 'grad_norm': 0.019559573382139206, 'learning_rate': 1.8825509907063327e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 482.08, 'epoch': 0.73} | |
| 73%|███████████████████████████████████████████████████████████████████████ | 162/221 [19:43<07:00, 7.12s/it] 74%|███████████████████████████████████████████████████████████████████████▌ | 163/221 [19:50<06:51, 7.10s/it] {'loss': 0.0202, 'grad_norm': 0.016423381865024567, 'learning_rate': 1.8244211507891063e-05, 'memory/max_active (GiB)': 48.73, 'memory/max_allocated (GiB)': 48.73, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 425.58, 'epoch': 0.74} | |
| 74%|███████████████████████████████████████████████████████████████████████▌ | 163/221 [19:50<06:51, 7.10s/it] 74%|███████████████████████████████████████████████████████████████████████▉ | 164/221 [19:57<06:44, 7.10s/it] {'loss': 0.0257, 'grad_norm': 0.01980419084429741, 'learning_rate': 1.7670019939210024e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 411.46, 'epoch': 0.74} | |
| 74%|███████████████████████████████████████████████████████████████████████▉ | 164/221 [19:57<06:44, 7.10s/it] 75%|████████████████████████████████████████████████████████████████████████▍ | 165/221 [20:05<06:39, 7.13s/it] {'loss': 0.025, 'grad_norm': 0.021348468959331512, 'learning_rate': 1.7103063703014372e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 498.15, 'epoch': 0.75} | |
| 75%|████████████████████████████████████████████████████████████████████████▍ | 165/221 [20:05<06:39, 7.13s/it] 75%|████████████████████████████████████████████████████████████████████████▊ | 166/221 [20:12<06:29, 7.09s/it] {'loss': 0.026, 'grad_norm': 0.01638958230614662, 'learning_rate': 1.6543469682057106e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 437.25, 'epoch': 0.75} | |
| 75%|████████████████████████████████████████████████████████████████████████▊ | 166/221 [20:12<06:29, 7.09s/it] 76%|█████████████████████████████████████████████████████████████████████████▎ | 167/221 [20:19<06:23, 7.10s/it] {'loss': 0.0306, 'grad_norm': 0.02299441583454609, 'learning_rate': 1.599136311145402e-05, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 460.47, 'epoch': 0.76} | |
| 76%|█████████████████████████████████████████████████████████████████████████▎ | 167/221 [20:19<06:23, 7.10s/it] 76%|█████████████████████████████████████████████████████████████████████████▋ | 168/221 [20:26<06:15, 7.09s/it] {'loss': 0.025, 'grad_norm': 0.017598189413547516, 'learning_rate': 1.544686755065677e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 478.71, 'epoch': 0.76} | |
| 76%|█████████████████████████████████████████████████████████████████████████▋ | 168/221 [20:26<06:15, 7.09s/it] 76%|██████████████████████████████████████████████████████████████████████████▏ | 169/221 [20:33<06:10, 7.13s/it] {'loss': 0.0236, 'grad_norm': 0.01685059629380703, 'learning_rate': 1.4910104855800427e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.43, 'epoch': 0.76} | |
| 76%|██████████████████████████████████████████████████████████████████████████▏ | 169/221 [20:33<06:10, 7.13s/it] 77%|██████████████████████████████████████████████████████████████████████████▌ | 170/221 [20:40<06:03, 7.13s/it] {'loss': 0.0272, 'grad_norm': 0.018304958939552307, 'learning_rate': 1.438119515243277e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 451.61, 'epoch': 0.77} | |
| 77%|██████████████████████████████████████████████████████████████████████████▌ | 170/221 [20:40<06:03, 7.13s/it] 77%|███████████████████████████████████████████████████████████████████████████ | 171/221 [20:47<05:56, 7.12s/it] {'loss': 0.022, 'grad_norm': 0.018485043197870255, 'learning_rate': 1.3860256808630428e-05, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 461.29, 'epoch': 0.77} | |
| 77%|███████████████████████████████████████████████████████████████████████████ | 171/221 [20:47<05:56, 7.12s/it] 78%|███████████████████████████████████████████████████████████████████████████▍ | 172/221 [20:54<05:50, 7.15s/it] {'loss': 0.0244, 'grad_norm': 0.018180640414357185, 'learning_rate': 1.3347406408508695e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 401.39, 'epoch': 0.78} | |
| 78%|███████████████████████████████████████████████████████████████████████████▍ | 172/221 [20:54<05:50, 7.15s/it] 78%|███████████████████████████████████████████████████████████████████████████▉ | 173/221 [21:01<05:42, 7.13s/it] {'loss': 0.0284, 'grad_norm': 0.018684981390833855, 'learning_rate': 1.2842758726130283e-05, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.88, 'epoch': 0.78} | |
| 78%|███████████████████████████████████████████████████████████████████████████▉ | 173/221 [21:01<05:42, 7.13s/it] 79%|████████████████████████████████████████████████████████████████████████████▎ | 174/221 [21:09<05:36, 7.16s/it] {'loss': 0.0289, 'grad_norm': 0.021512368693947792, 'learning_rate': 1.2346426699819458e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 383.13, 'epoch': 0.79} | |
| 79%|████████████████████████████████████████████████████████████████████████████▎ | 174/221 [21:09<05:36, 7.16s/it] 79%|████████████████████████████████████████████████████████████████████████████▊ | 175/221 [21:16<05:29, 7.15s/it] {'loss': 0.0279, 'grad_norm': 0.023360926657915115, 'learning_rate': 1.1858521406886675e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 361.55, 'epoch': 0.79} | |
| 79%|████████████████████████████████████████████████████████████████████████████▊ | 175/221 [21:16<05:29, 7.15s/it] 80%|█████████████████████████████████████████████████████████████████████████████▏ | 176/221 [21:23<05:21, 7.14s/it] {'loss': 0.0279, 'grad_norm': 0.022415969520807266, 'learning_rate': 1.137915203877003e-05, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 430.44, 'epoch': 0.8} | |
| 80%|█████████████████████████████████████████████████████████████████████████████▏ | 176/221 [21:23<05:21, 7.14s/it] 80%|█████████████████████████████████████████████████████████████████████████████▋ | 177/221 [21:30<05:14, 7.16s/it] {'loss': 0.0256, 'grad_norm': 0.01842794381082058, 'learning_rate': 1.090842587659851e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 407.33, 'epoch': 0.8} | |
| 80%|█████████████████████████████████████████████████████████████████████████████▋ | 177/221 [21:30<05:14, 7.16s/it] 81%|██████████████████████████████████████████████████████████████████████████████▏ | 178/221 [21:37<05:07, 7.15s/it] {'loss': 0.0282, 'grad_norm': 0.021043118089437485, 'learning_rate': 1.0446448267182952e-05, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 398.08, 'epoch': 0.81} | |
| 81%|██████████████████████████████████████████████████████████████████████████████▏ | 178/221 [21:37<05:07, 7.15s/it] 81%|██████████████████████████████████████████████████████████████████████████████▌ | 179/221 [21:44<05:01, 7.17s/it] {'loss': 0.0266, 'grad_norm': 0.020777752622961998, 'learning_rate': 9.993322599439692e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.9, 'epoch': 0.81} | |
| 81%|██████████████████████████████████████████████████████████████████████████████▌ | 179/221 [21:44<05:01, 7.17s/it] 81%|███████████████████████████████████████████████████████████████████████████████ | 180/221 [21:52<04:52, 7.14s/it] {'loss': 0.0365, 'grad_norm': 0.026739781722426414, 'learning_rate': 9.549150281252633e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 380.35, 'epoch': 0.81} | |
| 81%|███████████████████████████████████████████████████████████████████████████████ | 180/221 [21:52<04:52, 7.14s/it][2025-11-27 00:44:23,881] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-180 | |
| 82%|███████████████████████████████████████████████████████████████████████████████▍ | 181/221 [22:11<07:11, 10.79s/it] {'loss': 0.0358, 'grad_norm': 0.01902214251458645, 'learning_rate': 9.114030716778433e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 465.08, 'epoch': 0.82} | |
| 82%|███████████████████████████████████████████████████████████████████████████████▍ | 181/221 [22:11<07:11, 10.79s/it] 82%|███████████████████████████████████████████████████████████████████████████████▉ | 182/221 [22:18<06:19, 9.74s/it] {'loss': 0.0249, 'grad_norm': 0.019700270146131516, 'learning_rate': 8.688061284200266e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 343.96, 'epoch': 0.82} | |
| 82%|███████████████████████████████████████████████████████████████████████████████▉ | 182/221 [22:18<06:19, 9.74s/it] 83%|████████████████████████████████████████████████████████████████████████████████▎ | 183/221 [22:25<05:40, 8.95s/it] {'loss': 0.0259, 'grad_norm': 0.021064477041363716, 'learning_rate': 8.271337313934869e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 510.67, 'epoch': 0.83} | |
| 83%|████████████████████████████████████████████████████████████████████████████████▎ | 183/221 [22:25<05:40, 8.95s/it] 83%|████████████████████████████████████████████████████████████████████████████████▊ | 184/221 [22:33<05:13, 8.47s/it] {'loss': 0.0291, 'grad_norm': 0.01918896846473217, 'learning_rate': 7.863952067298042e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 285.99, 'epoch': 0.83} | |
| 83%|████████████████████████████████████████████████████████████████████████████████▊ | 184/221 [22:33<05:13, 8.47s/it] 84%|█████████████████████████████████████████████████████████████████████████████████▏ | 185/221 [22:40<04:50, 8.08s/it] {'loss': 0.0298, 'grad_norm': 0.020457495003938675, 'learning_rate': 7.465996715633028e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 322.48, 'epoch': 0.84} | |
| 84%|█████████████████████████████████████████████████████████████████████████████████▏ | 185/221 [22:40<04:50, 8.08s/it] 84%|█████████████████████████████████████████████████████████████████████████████████▋ | 186/221 [22:47<04:32, 7.77s/it] {'loss': 0.0276, 'grad_norm': 0.019534621387720108, 'learning_rate': 7.077560319906695e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 527.31, 'epoch': 0.84} | |
| 84%|█████████████████████████████████████████████████████████████████████████████████▋ | 186/221 [22:47<04:32, 7.77s/it] 85%|██████████████████████████████████████████████████████████████████████████████████ | 187/221 [22:54<04:16, 7.55s/it] {'loss': 0.0284, 'grad_norm': 0.019534330815076828, 'learning_rate': 6.698729810778065e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 406.99, 'epoch': 0.85} | |
| 85%|██████████████████████████████████████████████████████████████████████████████████ | 187/221 [22:54<04:16, 7.55s/it] 85%|██████████████████████████████████████████████████████████████████████████████████▌ | 188/221 [23:01<04:04, 7.42s/it] {'loss': 0.0306, 'grad_norm': 0.032107334583997726, 'learning_rate': 6.329589969143518e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 421.76, 'epoch': 0.85} | |
| 85%|██████████████████████████████████████████████████████████████████████████████████▌ | 188/221 [23:01<04:04, 7.42s/it] 86%|██████████████████████████████████████████████████████████████████████████████████▉ | 189/221 [23:08<03:56, 7.38s/it] {'loss': 0.0412, 'grad_norm': 0.019100667908787727, 'learning_rate': 5.9702234071631e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 470.21, 'epoch': 0.86} | |
| 86%|██████████████████████████████████████████████████████████████████████████████████▉ | 189/221 [23:08<03:56, 7.38s/it] 86%|███████████████████████████████████████████████████████████████████████████████████▍ | 190/221 [23:15<03:46, 7.31s/it] {'loss': 0.027, 'grad_norm': 0.01997012086212635, 'learning_rate': 5.620710549772295e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 335.96, 'epoch': 0.86} | |
| 86%|███████████████████████████████████████████████████████████████████████████████████▍ | 190/221 [23:15<03:46, 7.31s/it] 86%|███████████████████████████████████████████████████████████████████████████████████▊ | 191/221 [23:22<03:37, 7.25s/it] {'loss': 0.0248, 'grad_norm': 0.01957276090979576, 'learning_rate': 5.281129616683167e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 355.72, 'epoch': 0.86} | |
| 86%|███████████████████████████████████████████████████████████████████████████████████▊ | 191/221 [23:23<03:37, 7.25s/it] 87%|████████████████████████████████████████████████████████████████████████████████████▎ | 192/221 [23:30<03:29, 7.22s/it] {'loss': 0.0266, 'grad_norm': 0.019423488527536392, 'learning_rate': 4.951556604879048e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 393.6, 'epoch': 0.87} | |
| 87%|████████████████████████████████████████████████████████████████████████████████████▎ | 192/221 [23:30<03:29, 7.22s/it] 87%|████████████████████████████████████████████████████████████████████████████████████▋ | 193/221 [23:37<03:22, 7.25s/it] {'loss': 0.0246, 'grad_norm': 0.021110599860548973, 'learning_rate': 4.632065271606756e-06, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 423.45, 'epoch': 0.87} | |
| 87%|████████████████████████████████████████████████████████████████████████████████████▋ | 193/221 [23:37<03:22, 7.25s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████▏ | 194/221 [23:44<03:14, 7.21s/it] {'loss': 0.026, 'grad_norm': 0.02191292867064476, 'learning_rate': 4.322727117869951e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 404.69, 'epoch': 0.88} | |
| 88%|█████████████████████████████████████████████████████████████████████████████████████▏ | 194/221 [23:44<03:14, 7.21s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████▌ | 195/221 [23:51<03:05, 7.15s/it] {'loss': 0.0277, 'grad_norm': 0.018202103674411774, 'learning_rate': 4.023611372427471e-06, 'memory/max_active (GiB)': 48.81, 'memory/max_allocated (GiB)': 48.81, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 501.69, 'epoch': 0.88} | |
| 88%|█████████████████████████████████████████████████████████████████████████████████████▌ | 195/221 [23:51<03:05, 7.15s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████ | 196/221 [23:58<02:58, 7.13s/it] {'loss': 0.0203, 'grad_norm': 0.016622671857476234, 'learning_rate': 3.734784976300165e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 453.02, 'epoch': 0.89} | |
| 89%|██████████████████████████████████████████████████████████████████████████████████████ | 196/221 [23:58<02:58, 7.13s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████▍ | 197/221 [24:05<02:51, 7.16s/it] {'loss': 0.0281, 'grad_norm': 0.019572410732507706, 'learning_rate': 3.4563125677897932e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 396.87, 'epoch': 0.89} | |
| 89%|██████████████████████████████████████████████████████████████████████████████████████▍ | 197/221 [24:05<02:51, 7.16s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████▉ | 198/221 [24:13<02:44, 7.17s/it] {'loss': 0.0272, 'grad_norm': 0.02384166046977043, 'learning_rate': 3.18825646801314e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.89, 'epoch': 0.9} | |
| 90%|██████████████████████████████████████████████████████████████████████████████████████▉ | 198/221 [24:13<02:44, 7.17s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████▎ | 199/221 [24:20<02:37, 7.14s/it] {'loss': 0.0291, 'grad_norm': 0.022356677800416946, 'learning_rate': 2.930676666954846e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 367.48, 'epoch': 0.9} | |
| 90%|███████████████████████████████████████████████████████████████████████████████████████▎ | 199/221 [24:20<02:37, 7.14s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████▊ | 200/221 [24:27<02:30, 7.15s/it] {'loss': 0.0354, 'grad_norm': 0.024130800738930702, 'learning_rate': 2.6836308100417873e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 509.26, 'epoch': 0.9} | |
| 90%|███████████████████████████████████████████████████████████████████████████████████████▊ | 200/221 [24:27<02:30, 7.15s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████▏ | 201/221 [24:34<02:23, 7.17s/it] {'loss': 0.0267, 'grad_norm': 0.021222814917564392, 'learning_rate': 2.4471741852423237e-06, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 494.07, 'epoch': 0.91} | |
| 91%|████████████████████████████████████████████████████████████████████████████████████████▏ | 201/221 [24:34<02:23, 7.17s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████▋ | 202/221 [24:41<02:15, 7.16s/it] {'loss': 0.0307, 'grad_norm': 0.01970573328435421, 'learning_rate': 2.221359710692961e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 412.36, 'epoch': 0.91} | |
| 91%|████████████████████████████████████████████████████████████████████████████████████████▋ | 202/221 [24:41<02:15, 7.16s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████ | 203/221 [24:48<02:08, 7.13s/it] {'loss': 0.0234, 'grad_norm': 0.01854623667895794, 'learning_rate': 2.006237922855553e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 394.22, 'epoch': 0.92} | |
| 92%|█████████████████████████████████████████████████████████████████████████████████████████ | 203/221 [24:48<02:08, 7.13s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████▌ | 204/221 [24:55<02:01, 7.15s/it] {'loss': 0.0241, 'grad_norm': 0.0231945738196373, 'learning_rate': 1.8018569652073381e-06, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.08, 'epoch': 0.92} | |
| 92%|█████████████████████████████████████████████████████████████████████████████████████████▌ | 204/221 [24:55<02:01, 7.15s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████▉ | 205/221 [25:03<01:54, 7.13s/it] {'loss': 0.0264, 'grad_norm': 0.018703971058130264, 'learning_rate': 1.6082625774666794e-06, 'memory/max_active (GiB)': 49.0, 'memory/max_allocated (GiB)': 49.0, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 471.75, 'epoch': 0.93} | |
| 93%|█████████████████████████████████████████████████████████████████████████████████████████▉ | 205/221 [25:03<01:54, 7.13s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 206/221 [25:10<01:47, 7.14s/it] {'loss': 0.0251, 'grad_norm': 0.017928369343280792, 'learning_rate': 1.4254980853566247e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 416.2, 'epoch': 0.93} | |
| 93%|██████████████████████████████████████████████████████████████████████████████████████████▍ | 206/221 [25:10<01:47, 7.14s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████▊ | 207/221 [25:17<01:39, 7.13s/it] {'loss': 0.0239, 'grad_norm': 0.018282128497958183, 'learning_rate': 1.2536043909088191e-06, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 325.9, 'epoch': 0.94} | |
| 94%|██████████████████████████████████████████████████████████████████████████████████████████▊ | 207/221 [25:17<01:39, 7.13s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████▎ | 208/221 [25:24<01:32, 7.12s/it] {'loss': 0.0307, 'grad_norm': 0.021411525085568428, 'learning_rate': 1.0926199633097157e-06, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 382.68, 'epoch': 0.94} | |
| 94%|███████████████████████████████████████████████████████████████████████████████████████████▎ | 208/221 [25:24<01:32, 7.12s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████▋ | 209/221 [25:31<01:25, 7.17s/it] {'loss': 0.0287, 'grad_norm': 0.02065850794315338, 'learning_rate': 9.42580830291373e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 431.06, 'epoch': 0.95} | |
| 95%|███████████████████████████████████████████████████████████████████████████████████████████▋ | 209/221 [25:31<01:25, 7.17s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████▏ | 210/221 [25:39<01:19, 7.21s/it] {'loss': 0.0212, 'grad_norm': 0.019915733486413956, 'learning_rate': 8.035205700685167e-07, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 418.82, 'epoch': 0.95} | |
| 95%|████████████████████████████████████████████████████████████████████████████████████████████▏ | 210/221 [25:39<01:19, 7.21s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████▌ | 211/221 [25:46<01:11, 7.18s/it] {'loss': 0.0264, 'grad_norm': 0.020451124757528305, 'learning_rate': 6.75470303823933e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 438.8, 'epoch': 0.95} | |
| 95%|████████████████████████████████████████████████████████████████████████████████████████████▌ | 211/221 [25:46<01:11, 7.18s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████ | 212/221 [25:53<01:04, 7.17s/it] {'loss': 0.0268, 'grad_norm': 0.02076980657875538, 'learning_rate': 5.584586887435739e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 415.6, 'epoch': 0.96} | |
| 96%|█████████████████████████████████████████████████████████████████████████████████████████████ | 212/221 [25:53<01:04, 7.17s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████▍ | 213/221 [26:00<00:57, 7.14s/it] {'loss': 0.0246, 'grad_norm': 0.019138546660542488, 'learning_rate': 4.52511911603265e-07, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 364.67, 'epoch': 0.96} | |
| 96%|█████████████████████████████████████████████████████████████████████████████████████████████▍ | 213/221 [26:00<00:57, 7.14s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████▉ | 214/221 [26:07<00:49, 7.11s/it] {'loss': 0.0348, 'grad_norm': 0.026033613830804825, 'learning_rate': 3.576536829081323e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 505.19, 'epoch': 0.97} | |
| 97%|█████████████████████████████████████████████████████████████████████████████████████████████▉ | 214/221 [26:07<00:49, 7.11s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████▎ | 215/221 [26:14<00:42, 7.11s/it] {'loss': 0.0308, 'grad_norm': 0.01909700781106949, 'learning_rate': 2.7390523158633554e-07, 'memory/max_active (GiB)': 48.93, 'memory/max_allocated (GiB)': 48.93, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 402.37, 'epoch': 0.97} | |
| 97%|██████████████████████████████████████████████████████████████████████████████████████████████▎ | 215/221 [26:14<00:42, 7.11s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 216/221 [26:21<00:35, 7.17s/it] {'loss': 0.0251, 'grad_norm': 0.017447378486394882, 'learning_rate': 2.012853002380466e-07, 'memory/max_active (GiB)': 48.85, 'memory/max_allocated (GiB)': 48.85, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 436.02, 'epoch': 0.98} | |
| 98%|██████████████████████████████████████████████████████████████████████████████████████████████▊ | 216/221 [26:21<00:35, 7.17s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28, 7.16s/it] {'loss': 0.0294, 'grad_norm': 0.0193310659378767, 'learning_rate': 1.3981014094099353e-07, 'memory/max_active (GiB)': 48.97, 'memory/max_allocated (GiB)': 48.97, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 392.76, 'epoch': 0.98} | |
| 98%|███████████████████████████████████████████████████████████████████████████████████████████████▏ | 217/221 [26:28<00:28, 7.16s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21, 7.17s/it] {'loss': 0.0286, 'grad_norm': 0.023237884044647217, 'learning_rate': 8.949351161324227e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 444.9, 'epoch': 0.99} | |
| 99%|███████████████████████████████████████████████████████████████████████████████████████████████▋ | 218/221 [26:36<00:21, 7.17s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14, 7.15s/it] {'loss': 0.0264, 'grad_norm': 0.017193371430039406, 'learning_rate': 5.0346672934270534e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 442.84, 'epoch': 0.99} | |
| 99%|████████████████████████████████████████████████████████████████████████████████████████████████ | 219/221 [26:43<00:14, 7.15s/it] 100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07, 7.13s/it] {'loss': 0.0311, 'grad_norm': 0.01859343983232975, 'learning_rate': 2.237838582483387e-08, 'memory/max_active (GiB)': 48.89, 'memory/max_allocated (GiB)': 48.89, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 512.81, 'epoch': 1.0} | |
| 100%|████████████████████████████████████████████████████████████████████████████████████████████████▌| 220/221 [26:50<00:07, 7.13s/it] 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00, 7.17s/it] {'loss': 0.0256, 'grad_norm': 0.021252349019050598, 'learning_rate': 5.594909486328348e-09, 'memory/max_active (GiB)': 48.77, 'memory/max_allocated (GiB)': 48.77, 'memory/device_reserved (GiB)': 50.97, 'tokens_per_second_per_gpu': 338.96, 'epoch': 1.0} | |
| 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [26:57<00:00, 7.17s/it][2025-11-27 00:49:29,268] [INFO] [axolotl.core.trainers.base._save:665] [PID:80269] Saving model checkpoint to ./nov262025-sc-LoRA-Run/checkpoint-221 | |
| {'train_runtime': 1631.9607, 'train_samples_per_second': 17.334, 'train_steps_per_second': 0.135, 'train_loss': 0.030637798653873383, 'memory/max_active (GiB)': 15.75, 'memory/max_allocated (GiB)': 15.75, 'memory/device_reserved (GiB)': 50.97, 'epoch': 1.0} | |
| 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00, 7.17s/it] 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [27:09<00:00, 7.37s/it] | |
| [2025-11-27 00:49:33,160] [INFO] [axolotl.train.save_trained_model:218] [PID:80269] Training completed! Saving trained model to ./nov262025-sc-LoRA-Run. | |
| [2025-11-27 00:49:33,820] [INFO] [axolotl.train.save_trained_model:336] [PID:80269] Model successfully saved to ./nov262025-sc-LoRA-Run | |