qwen32b-thai-lora / debug.log
devrf's picture
Upload folder using huggingface_hub
5684a7e verified
[2025-12-27 08:31:39,582] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:8935] baseline 0.000GB ()
[2025-12-27 08:31:39,583] [INFO] [axolotl.cli.config.load_cfg:248] [PID:8935] config:
{
"activation_offloading": false,
"adapter": "lora",
"axolotl_config_path": "config.yaml",
"base_model": "Qwen/Qwen3-32B",
"base_model_config": "Qwen/Qwen3-32B",
"batch_size": 32,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_90",
"fp8": false,
"n_gpu": 1,
"n_node": 1
},
"context_parallel_size": 1,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_processes": 24,
"datasets": [
{
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "/workspace/data/wangchan_fixed",
"split": "train",
"trust_remote_code": false,
"type": "alpaca"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"env_capabilities": {
"torch_version": "2.7.1"
},
"eval_batch_size": 4,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"flash_attention": true,
"fp16": false,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": true
},
"include_tkps": true,
"learning_rate": 0.0001,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"micro_batch_size": 4,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"optimizer": "adamw_torch",
"output_dir": "./outputs/qwen32b-thai",
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 300,
"sequence_len": 2048,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-32B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"scale_rewards": true,
"sync_ref_model": false,
"use_vllm": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.03,
"weight_decay": 0.01,
"world_size": 1
}
[2025-12-27 08:31:40,073] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8935] EOS: 151645 / <|im_end|>
[2025-12-27 08:31:40,073] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8935] BOS: None / None
[2025-12-27 08:31:40,073] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8935] PAD: 151643 / <|endoftext|>
[2025-12-27 08:31:40,074] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8935] UNK: None / None
[2025-12-27 08:31:40,074] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:8935] Unable to find prepared dataset in last_run_prepared/b99cf4adb2b12295d2c59391c5b4de0d
[2025-12-27 08:31:40,075] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:8935] Loading raw datasets...
[2025-12-27 08:31:40,075] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:8935] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2025-12-27 08:31:40,087] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:8935] Loading dataset: /workspace/data/wangchan_fixed with base_type: alpaca and prompt_style: None
[2025-12-27 08:31:40,292] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:8935] min_input_len: 86
[2025-12-27 08:31:40,293] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:8935] max_input_len: 4096
Dropping Long Sequences (>2048) (num_proc=24): 0%| | 0/32207 [00:00<?, ? examples/s] Dropping Long Sequences (>2048) (num_proc=24): 3%|β–Ž | 1000/32207 [00:00<00:23, 1321.76 examples/s] Dropping Long Sequences (>2048) (num_proc=24): 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 22342/32207 [00:00<00:00, 32261.24 examples/s] Dropping Long Sequences (>2048) (num_proc=24): 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 30865/32207 [00:01<00:00, 36983.78 examples/s] Dropping Long Sequences (>2048) (num_proc=24): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32207/32207 [00:01<00:00, 24307.06 examples/s]
[2025-12-27 08:31:41,663] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:8935] Dropped 9117 samples from dataset
Saving the dataset (0/24 shards): 0%| | 0/23090 [00:00<?, ? examples/s] Saving the dataset (0/24 shards): 4%|▍ | 963/23090 [00:00<00:05, 3702.16 examples/s] Saving the dataset (1/24 shards): 4%|▍ | 963/23090 [00:00<00:05, 3702.16 examples/s] Saving the dataset (2/24 shards): 8%|β–Š | 1925/23090 [00:00<00:05, 3702.16 examples/s] Saving the dataset (3/24 shards): 13%|β–ˆβ–Ž | 2887/23090 [00:00<00:05, 3702.16 examples/s] Saving the dataset (4/24 shards): 17%|β–ˆβ–‹ | 3849/23090 [00:00<00:05, 3702.16 examples/s] Saving the dataset (5/24 shards): 21%|β–ˆβ–ˆ | 4811/23090 [00:00<00:04, 3702.16 examples/s] Saving the dataset (6/24 shards): 25%|β–ˆβ–ˆβ–Œ | 5773/23090 [00:00<00:04, 3702.16 examples/s] Saving the dataset (7/24 shards): 29%|β–ˆβ–ˆβ–‰ | 6735/23090 [00:00<00:04, 3702.16 examples/s] Saving the dataset (8/24 shards): 33%|β–ˆβ–ˆβ–ˆβ–Ž | 7697/23090 [00:00<00:04, 3702.16 examples/s] Saving the dataset (9/24 shards): 38%|β–ˆβ–ˆβ–ˆβ–Š | 8659/23090 [00:00<00:03, 3702.16 examples/s] Saving the dataset (10/24 shards): 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 9621/23090 [00:00<00:03, 3702.16 examples/s] Saving the dataset (11/24 shards): 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 10583/23090 [00:00<00:03, 3702.16 examples/s] Saving the dataset (12/24 shards): 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 11545/23090 [00:00<00:03, 3702.16 examples/s] Saving the dataset (13/24 shards): 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 12507/23090 [00:00<00:02, 3702.16 examples/s] Saving the dataset (14/24 shards): 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 13469/23090 [00:00<00:02, 3702.16 examples/s] Saving the dataset (15/24 shards): 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 14431/23090 [00:00<00:02, 3702.16 examples/s] Saving the dataset (16/24 shards): 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 15393/23090 [00:00<00:02, 3702.16 examples/s] Saving the dataset (17/24 shards): 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17317/23090 [00:00<00:01, 3702.16 examples/s] Saving the dataset (18/24 shards): 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 17317/23090 [00:00<00:01, 3702.16 examples/s] Saving the dataset (19/24 shards): 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 18279/23090 [00:00<00:01, 3702.16 examples/s] Saving the dataset (20/24 shards): 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 19241/23090 [00:00<00:01, 3702.16 examples/s] Saving the dataset (21/24 shards): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 20203/23090 [00:00<00:00, 3702.16 examples/s] Saving the dataset (22/24 shards): 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 21165/23090 [00:00<00:00, 3702.16 examples/s] Saving the dataset (23/24 shards): 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 22127/23090 [00:00<00:00, 3702.16 examples/s] Saving the dataset (24/24 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23090/23090 [00:00<00:00, 3702.16 examples/s] Saving the dataset (24/24 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 23090/23090 [00:00<00:00, 55006.16 examples/s]
[2025-12-27 08:31:42,283] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:8935] total_num_tokens: 23_442_129
[2025-12-27 08:31:42,613] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:8935] `total_supervised_tokens: 24_827_365`
[2025-12-27 08:31:42,614] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:8935] total_num_steps: 1444
[2025-12-27 08:31:42,614] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:8935] Maximum number of steps set at 1444
[2025-12-27 08:31:42,639] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:8935] Loading tokenizer... Qwen/Qwen3-32B
[2025-12-27 08:31:43,077] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:8935] EOS: 151645 / <|im_end|>
[2025-12-27 08:31:43,077] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:8935] BOS: None / None
[2025-12-27 08:31:43,078] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:8935] PAD: 151643 / <|endoftext|>
[2025-12-27 08:31:43,078] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:8935] UNK: None / None
[2025-12-27 08:31:43,078] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:8935] Loading model
[2025-12-27 08:31:43,134] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:8935] Patched Trainer.evaluation_loop with nanmean loss calculation
[2025-12-27 08:31:43,136] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:8935] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
Loading checkpoint shards: 0%| | 0/17 [00:00<?, ?it/s] Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17/17 [00:00<00:00, 197.40it/s]
[2025-12-27 08:31:46,056] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:8935] Converting modules to torch.bfloat16
[2025-12-27 08:31:48,057] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8935] Memory usage after model load 0.000GB ()
trainable params: 268,435,456 || all params: 33,030,558,720 || trainable%: 0.8127
[2025-12-27 08:31:50,286] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:8935] after adapters 0.000GB ()
[2025-12-27 08:32:05,781] [INFO] [axolotl.train.save_initial_configs:398] [PID:8935] Pre-saving adapter config to ./outputs/qwen32b-thai...
[2025-12-27 08:32:05,784] [INFO] [axolotl.train.save_initial_configs:402] [PID:8935] Pre-saving tokenizer to ./outputs/qwen32b-thai...
[2025-12-27 08:32:05,933] [INFO] [axolotl.train.save_initial_configs:407] [PID:8935] Pre-saving model config to ./outputs/qwen32b-thai...
[2025-12-27 08:32:05,938] [INFO] [axolotl.train.execute_training:196] [PID:8935] Starting trainer...
0%| | 0/1444 [00:00<?, ?it/s] 0%| | 1/1444 [00:26<10:31:39, 26.26s/it] 0%| | 2/1444 [00:52<10:37:32, 26.53s/it] 0%| | 3/1444 [01:16<10:08:02, 25.32s/it] 0%| | 4/1444 [01:40<9:55:27, 24.81s/it] 0%| | 5/1444 [02:08<10:18:41, 25.80s/it] 0%| | 6/1444 [02:33<10:12:00, 25.54s/it] 0%| | 7/1444 [03:00<10:24:45, 26.09s/it] 1%| | 8/1444 [03:27<10:29:36, 26.31s/it] 1%| | 9/1444 [03:51<10:12:59, 25.63s/it] 1%| | 10/1444 [04:13<9:43:57, 24.43s/it] {'loss': 0.8494, 'grad_norm': 0.1854863315820694, 'learning_rate': 2.0930232558139536e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 89.89, 'tokens_per_second_per_gpu': 1111.65, 'epoch': 0.01}
1%| | 10/1444 [04:13<9:43:57, 24.43s/it] 1%| | 11/1444 [04:35<9:25:57, 23.70s/it] 1%| | 12/1444 [04:57<9:15:57, 23.29s/it] 1%| | 13/1444 [05:22<9:29:19, 23.87s/it] 1%| | 14/1444 [05:47<9:34:16, 24.10s/it] 1%| | 15/1444 [06:12<9:39:32, 24.33s/it] 1%| | 16/1444 [06:35<9:29:10, 23.91s/it] 1%| | 17/1444 [06:59<9:26:58, 23.84s/it] 1%| | 18/1444 [07:25<9:47:47, 24.73s/it] 1%|▏ | 19/1444 [07:49<9:41:22, 24.48s/it] 1%|▏ | 20/1444 [08:12<9:31:08, 24.06s/it] {'loss': 0.7842, 'grad_norm': 0.09567277133464813, 'learning_rate': 4.418604651162791e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.37, 'tokens_per_second_per_gpu': 1041.18, 'epoch': 0.03}
1%|▏ | 20/1444 [08:12<9:31:08, 24.06s/it] 1%|▏ | 21/1444 [08:36<9:30:12, 24.04s/it] 2%|▏ | 22/1444 [09:00<9:26:01, 23.88s/it] 2%|▏ | 23/1444 [09:24<9:29:03, 24.03s/it] 2%|▏ | 24/1444 [09:47<9:16:15, 23.50s/it] 2%|▏ | 25/1444 [10:13<9:38:12, 24.45s/it] 2%|▏ | 26/1444 [10:37<9:36:16, 24.38s/it] 2%|▏ | 27/1444 [11:03<9:45:32, 24.79s/it] 2%|▏ | 28/1444 [11:28<9:46:41, 24.86s/it] 2%|▏ | 29/1444 [11:55<10:02:07, 25.53s/it] 2%|▏ | 30/1444 [12:19<9:48:18, 24.96s/it] {'loss': 0.7353, 'grad_norm': 0.1211227998137474, 'learning_rate': 6.744186046511628e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.37, 'tokens_per_second_per_gpu': 993.82, 'epoch': 0.04}
2%|▏ | 30/1444 [12:19<9:48:18, 24.96s/it] 2%|▏ | 31/1444 [12:47<10:09:00, 25.86s/it] 2%|▏ | 32/1444 [13:11<9:56:21, 25.34s/it] 2%|▏ | 33/1444 [13:34<9:42:04, 24.75s/it] 2%|▏ | 34/1444 [13:57<9:24:17, 24.01s/it] 2%|▏ | 35/1444 [14:24<9:50:31, 25.15s/it] 2%|▏ | 36/1444 [14:49<9:45:40, 24.96s/it] 3%|β–Ž | 37/1444 [15:11<9:27:47, 24.21s/it] 3%|β–Ž | 38/1444 [15:36<9:28:13, 24.25s/it] 3%|β–Ž | 39/1444 [16:02<9:38:31, 24.71s/it] 3%|β–Ž | 40/1444 [16:29<9:55:28, 25.45s/it] {'loss': 0.6943, 'grad_norm': 0.09696491807699203, 'learning_rate': 9.069767441860465e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.37, 'tokens_per_second_per_gpu': 849.91, 'epoch': 0.06}
3%|β–Ž | 40/1444 [16:29<9:55:28, 25.45s/it] 3%|β–Ž | 41/1444 [16:54<9:55:55, 25.49s/it] 3%|β–Ž | 42/1444 [17:18<9:45:26, 25.05s/it] 3%|β–Ž | 43/1444 [17:44<9:47:24, 25.16s/it] 3%|β–Ž | 44/1444 [18:09<9:50:17, 25.30s/it] 3%|β–Ž | 45/1444 [18:32<9:33:07, 24.58s/it] 3%|β–Ž | 46/1444 [18:57<9:32:00, 24.55s/it] 3%|β–Ž | 47/1444 [19:22<9:37:35, 24.81s/it] 3%|β–Ž | 48/1444 [19:47<9:34:31, 24.69s/it] 3%|β–Ž | 49/1444 [20:11<9:33:03, 24.65s/it] 3%|β–Ž | 50/1444 [20:36<9:35:06, 24.75s/it] {'loss': 0.6814, 'grad_norm': 0.12007619440555573, 'learning_rate': 9.999547457436221e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 971.92, 'epoch': 0.07}
3%|β–Ž | 50/1444 [20:36<9:35:06, 24.75s/it] 4%|β–Ž | 51/1444 [20:59<9:21:06, 24.17s/it] 4%|β–Ž | 52/1444 [21:22<9:15:34, 23.95s/it] 4%|β–Ž | 53/1444 [21:50<9:39:41, 25.00s/it] 4%|β–Ž | 54/1444 [22:12<9:16:22, 24.02s/it] 4%|▍ | 55/1444 [22:37<9:25:52, 24.44s/it] 4%|▍ | 56/1444 [23:04<9:42:11, 25.17s/it] 4%|▍ | 57/1444 [23:30<9:48:11, 25.44s/it] 4%|▍ | 58/1444 [23:56<9:53:11, 25.68s/it] 4%|▍ | 59/1444 [24:18<9:26:08, 24.53s/it] 4%|▍ | 60/1444 [24:45<9:44:20, 25.33s/it] {'loss': 0.69, 'grad_norm': 0.12358752638101578, 'learning_rate': 9.996782216198338e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 862.06, 'epoch': 0.08}
4%|▍ | 60/1444 [24:45<9:44:20, 25.33s/it] 4%|▍ | 61/1444 [25:10<9:43:12, 25.30s/it] 4%|▍ | 62/1444 [25:36<9:41:49, 25.26s/it] 4%|▍ | 63/1444 [26:01<9:39:58, 25.20s/it] 4%|▍ | 64/1444 [26:25<9:35:35, 25.03s/it] 5%|▍ | 65/1444 [26:50<9:35:49, 25.05s/it] 5%|▍ | 66/1444 [27:15<9:34:31, 25.02s/it] 5%|▍ | 67/1444 [27:40<9:32:47, 24.96s/it] 5%|▍ | 68/1444 [28:01<9:01:22, 23.61s/it] 5%|▍ | 69/1444 [28:26<9:10:42, 24.03s/it] 5%|▍ | 70/1444 [28:49<9:05:39, 23.83s/it] {'loss': 0.7048, 'grad_norm': 0.11916535347700119, 'learning_rate': 9.991504534967746e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.38, 'tokens_per_second_per_gpu': 1024.18, 'epoch': 0.1}
5%|▍ | 70/1444 [28:49<9:05:39, 23.83s/it] 5%|▍ | 71/1444 [29:14<9:16:39, 24.33s/it] 5%|▍ | 72/1444 [29:38<9:13:55, 24.22s/it] 5%|β–Œ | 73/1444 [30:04<9:22:01, 24.60s/it] 5%|β–Œ | 74/1444 [30:27<9:14:26, 24.28s/it] 5%|β–Œ | 75/1444 [30:55<9:33:41, 25.14s/it] 5%|β–Œ | 76/1444 [31:18<9:22:11, 24.66s/it] 5%|β–Œ | 77/1444 [31:41<9:09:03, 24.10s/it] 5%|β–Œ | 78/1444 [32:08<9:28:52, 24.99s/it] 5%|β–Œ | 79/1444 [32:33<9:27:21, 24.94s/it] 6%|β–Œ | 80/1444 [32:57<9:18:27, 24.57s/it] {'loss': 0.6705, 'grad_norm': 0.12464027106761932, 'learning_rate': 9.983717067423721e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1009.08, 'epoch': 0.11}
6%|β–Œ | 80/1444 [32:57<9:18:27, 24.57s/it] 6%|β–Œ | 81/1444 [33:19<9:06:40, 24.06s/it] 6%|β–Œ | 82/1444 [33:45<9:14:05, 24.41s/it] 6%|β–Œ | 83/1444 [34:10<9:21:10, 24.74s/it] 6%|β–Œ | 84/1444 [34:35<9:22:13, 24.80s/it] 6%|β–Œ | 85/1444 [35:01<9:28:47, 25.11s/it] 6%|β–Œ | 86/1444 [35:23<9:06:33, 24.15s/it] 6%|β–Œ | 87/1444 [35:46<9:01:54, 23.96s/it] 6%|β–Œ | 88/1444 [36:09<8:53:45, 23.62s/it] 6%|β–Œ | 89/1444 [36:30<8:36:19, 22.86s/it] 6%|β–Œ | 90/1444 [36:54<8:39:15, 23.01s/it] {'loss': 0.6387, 'grad_norm': 0.1264505237340927, 'learning_rate': 9.973423729195168e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1027.91, 'epoch': 0.12}
6%|β–Œ | 90/1444 [36:54<8:39:15, 23.01s/it] 6%|β–‹ | 91/1444 [37:14<8:24:04, 22.35s/it] 6%|β–‹ | 92/1444 [37:33<7:59:20, 21.27s/it] 6%|β–‹ | 93/1444 [38:02<8:47:14, 23.42s/it] 7%|β–‹ | 94/1444 [38:26<8:56:50, 23.86s/it] 7%|β–‹ | 95/1444 [38:50<8:54:34, 23.78s/it] 7%|β–‹ | 96/1444 [39:15<8:59:27, 24.01s/it] 7%|β–‹ | 97/1444 [39:40<9:08:31, 24.43s/it] 7%|β–‹ | 98/1444 [40:04<9:02:02, 24.16s/it] 7%|β–‹ | 99/1444 [40:27<8:59:17, 24.06s/it] 7%|β–‹ | 100/1444 [40:52<9:02:49, 24.23s/it] {'loss': 0.6447, 'grad_norm': 0.1262999027967453, 'learning_rate': 9.960629695891814e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 935.25, 'epoch': 0.14}
7%|β–‹ | 100/1444 [40:52<9:02:49, 24.23s/it] 7%|β–‹ | 101/1444 [41:14<8:48:58, 23.63s/it] 7%|β–‹ | 102/1444 [41:36<8:34:06, 22.99s/it] 7%|β–‹ | 103/1444 [42:03<9:04:27, 24.36s/it] 7%|β–‹ | 104/1444 [42:30<9:16:44, 24.93s/it] 7%|β–‹ | 105/1444 [42:54<9:10:06, 24.65s/it] 7%|β–‹ | 106/1444 [43:21<9:27:35, 25.45s/it] 7%|β–‹ | 107/1444 [43:45<9:15:11, 24.92s/it] 7%|β–‹ | 108/1444 [44:11<9:22:10, 25.25s/it] 8%|β–Š | 109/1444 [44:35<9:13:14, 24.87s/it] 8%|β–Š | 110/1444 [44:59<9:10:21, 24.75s/it] {'loss': 0.6846, 'grad_norm': 0.12777547538280487, 'learning_rate': 9.945341400501838e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 997.71, 'epoch': 0.15}
8%|β–Š | 110/1444 [44:59<9:10:21, 24.75s/it] 8%|β–Š | 111/1444 [45:26<9:27:27, 25.54s/it] 8%|β–Š | 112/1444 [45:50<9:12:45, 24.90s/it] 8%|β–Š | 113/1444 [46:14<9:09:55, 24.79s/it] 8%|β–Š | 114/1444 [46:37<8:53:51, 24.08s/it] 8%|β–Š | 115/1444 [47:02<9:03:04, 24.52s/it] 8%|β–Š | 116/1444 [47:28<9:13:22, 25.00s/it] 8%|β–Š | 117/1444 [47:49<8:43:39, 23.68s/it] 8%|β–Š | 118/1444 [48:15<8:57:17, 24.31s/it] 8%|β–Š | 119/1444 [48:43<9:19:47, 25.35s/it] 8%|β–Š | 120/1444 [49:03<8:48:25, 23.95s/it] {'loss': 0.6765, 'grad_norm': 0.12563012540340424, 'learning_rate': 9.927566530157298e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1120.19, 'epoch': 0.17}
8%|β–Š | 120/1444 [49:03<8:48:25, 23.95s/it] 8%|β–Š | 121/1444 [49:29<8:59:42, 24.48s/it] 8%|β–Š | 122/1444 [49:52<8:51:24, 24.12s/it] 9%|β–Š | 123/1444 [50:17<8:53:46, 24.24s/it] 9%|β–Š | 124/1444 [50:45<9:20:21, 25.47s/it] 9%|β–Š | 125/1444 [51:13<9:36:00, 26.20s/it] 9%|β–Š | 126/1444 [51:35<9:09:21, 25.01s/it] 9%|β–‰ | 127/1444 [52:00<9:04:23, 24.80s/it] 9%|β–‰ | 128/1444 [52:25<9:11:09, 25.13s/it] 9%|β–‰ | 129/1444 [52:48<8:53:55, 24.36s/it] 9%|β–‰ | 130/1444 [53:16<9:19:25, 25.54s/it] {'loss': 0.6315, 'grad_norm': 0.13785897195339203, 'learning_rate': 9.907314022268946e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 801.56, 'epoch': 0.18}
9%|β–‰ | 130/1444 [53:16<9:19:25, 25.54s/it] 9%|β–‰ | 131/1444 [53:39<8:58:03, 24.59s/it] 9%|β–‰ | 132/1444 [54:02<8:48:46, 24.18s/it] 9%|β–‰ | 133/1444 [54:27<8:55:34, 24.51s/it] 9%|β–‰ | 134/1444 [54:54<9:08:07, 25.10s/it] 9%|β–‰ | 135/1444 [55:21<9:24:14, 25.86s/it] 9%|β–‰ | 136/1444 [55:48<9:25:51, 25.96s/it] 9%|β–‰ | 137/1444 [56:12<9:13:18, 25.40s/it] 10%|β–‰ | 138/1444 [56:36<9:07:21, 25.15s/it] 10%|β–‰ | 139/1444 [57:01<9:05:58, 25.10s/it] 10%|β–‰ | 140/1444 [57:25<8:58:28, 24.78s/it] {'loss': 0.6642, 'grad_norm': 0.14731284976005554, 'learning_rate': 9.884594060032406e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 944.29, 'epoch': 0.19}
10%|β–‰ | 140/1444 [57:25<8:58:28, 24.78s/it] 10%|β–‰ | 141/1444 [57:49<8:50:16, 24.42s/it] 10%|β–‰ | 142/1444 [58:11<8:35:11, 23.74s/it] 10%|β–‰ | 143/1444 [58:37<8:49:00, 24.40s/it] 10%|β–‰ | 144/1444 [59:03<8:57:19, 24.80s/it] 10%|β–ˆ | 145/1444 [59:27<8:52:40, 24.60s/it] 10%|β–ˆ | 146/1444 [59:55<9:15:53, 25.70s/it] 10%|β–ˆ | 147/1444 [1:00:18<9:00:07, 24.99s/it] 10%|β–ˆ | 148/1444 [1:00:44<9:04:09, 25.19s/it] 10%|β–ˆ | 149/1444 [1:01:08<8:57:51, 24.92s/it] 10%|β–ˆ | 150/1444 [1:01:34<9:03:23, 25.20s/it] {'loss': 0.6696, 'grad_norm': 0.1298578828573227, 'learning_rate': 9.859418067307928e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 913.99, 'epoch': 0.21}
10%|β–ˆ | 150/1444 [1:01:34<9:03:23, 25.20s/it] 10%|β–ˆ | 151/1444 [1:01:59<9:01:31, 25.13s/it] 11%|β–ˆ | 152/1444 [1:02:22<8:46:12, 24.44s/it] 11%|β–ˆ | 153/1444 [1:02:44<8:31:10, 23.76s/it] 11%|β–ˆ | 154/1444 [1:03:06<8:21:53, 23.34s/it] 11%|β–ˆ | 155/1444 [1:03:33<8:44:50, 24.43s/it] 11%|β–ˆ | 156/1444 [1:03:55<8:28:06, 23.67s/it] 11%|β–ˆ | 157/1444 [1:04:15<8:03:37, 22.55s/it] 11%|β–ˆ | 158/1444 [1:04:42<8:28:01, 23.70s/it] 11%|β–ˆ | 159/1444 [1:05:12<9:07:43, 25.58s/it] 11%|β–ˆ | 160/1444 [1:05:36<8:57:01, 25.09s/it] {'loss': 0.6768, 'grad_norm': 0.13122966885566711, 'learning_rate': 9.831798702876352e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 983.71, 'epoch': 0.22}
11%|β–ˆ | 160/1444 [1:05:36<8:57:01, 25.09s/it] 11%|β–ˆ | 161/1444 [1:05:59<8:45:27, 24.57s/it] 11%|β–ˆ | 162/1444 [1:06:24<8:46:49, 24.66s/it] 11%|β–ˆβ– | 163/1444 [1:06:48<8:45:37, 24.62s/it] 11%|β–ˆβ– | 164/1444 [1:07:11<8:33:40, 24.08s/it] 11%|β–ˆβ– | 165/1444 [1:07:37<8:42:18, 24.50s/it] 11%|β–ˆβ– | 166/1444 [1:08:02<8:45:29, 24.67s/it] 12%|β–ˆβ– | 167/1444 [1:08:29<8:59:37, 25.35s/it] 12%|β–ˆβ– | 168/1444 [1:08:54<8:56:39, 25.23s/it] 12%|β–ˆβ– | 169/1444 [1:09:22<9:19:30, 26.33s/it] 12%|β–ˆβ– | 170/1444 [1:09:49<9:18:15, 26.29s/it] {'loss': 0.6526, 'grad_norm': 0.12356515228748322, 'learning_rate': 9.801749854074122e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 903.88, 'epoch': 0.24}
12%|β–ˆβ– | 170/1444 [1:09:49<9:18:15, 26.29s/it] 12%|β–ˆβ– | 171/1444 [1:10:14<9:11:06, 25.98s/it] 12%|β–ˆβ– | 172/1444 [1:10:38<8:56:25, 25.30s/it] 12%|β–ˆβ– | 173/1444 [1:11:03<8:59:04, 25.45s/it] 12%|β–ˆβ– | 174/1444 [1:11:24<8:27:22, 23.97s/it] 12%|β–ˆβ– | 175/1444 [1:11:47<8:19:02, 23.60s/it] 12%|β–ˆβ– | 176/1444 [1:12:11<8:20:49, 23.70s/it] 12%|β–ˆβ– | 177/1444 [1:12:34<8:16:38, 23.52s/it] 12%|β–ˆβ– | 178/1444 [1:12:59<8:30:09, 24.18s/it] 12%|β–ˆβ– | 179/1444 [1:13:23<8:25:02, 23.95s/it] 12%|β–ˆβ– | 180/1444 [1:13:52<8:54:41, 25.38s/it] {'loss': 0.6415, 'grad_norm': 0.11831440776586533, 'learning_rate': 9.769286629810572e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 819.58, 'epoch': 0.25}
12%|β–ˆβ– | 180/1444 [1:13:52<8:54:41, 25.38s/it] 13%|β–ˆβ–Ž | 181/1444 [1:14:16<8:48:53, 25.13s/it] 13%|β–ˆβ–Ž | 182/1444 [1:14:42<8:50:21, 25.21s/it] 13%|β–ˆβ–Ž | 183/1444 [1:15:05<8:38:21, 24.66s/it] 13%|β–ˆβ–Ž | 184/1444 [1:15:29<8:33:37, 24.46s/it] 13%|β–ˆβ–Ž | 185/1444 [1:15:54<8:38:01, 24.69s/it] 13%|β–ˆβ–Ž | 186/1444 [1:16:18<8:33:12, 24.48s/it] 13%|β–ˆβ–Ž | 187/1444 [1:16:41<8:24:57, 24.10s/it] 13%|β–ˆβ–Ž | 188/1444 [1:17:07<8:34:00, 24.55s/it] 13%|β–ˆβ–Ž | 189/1444 [1:17:29<8:20:31, 23.93s/it] 13%|β–ˆβ–Ž | 190/1444 [1:17:54<8:25:13, 24.17s/it] {'loss': 0.6685, 'grad_norm': 0.12409751862287521, 'learning_rate': 9.73442535297099e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 973.62, 'epoch': 0.26}
13%|β–ˆβ–Ž | 190/1444 [1:17:54<8:25:13, 24.17s/it] 13%|β–ˆβ–Ž | 191/1444 [1:18:20<8:33:09, 24.57s/it] 13%|β–ˆβ–Ž | 192/1444 [1:18:46<8:43:10, 25.07s/it] 13%|β–ˆβ–Ž | 193/1444 [1:19:11<8:45:44, 25.22s/it] 13%|β–ˆβ–Ž | 194/1444 [1:19:37<8:46:18, 25.26s/it] 14%|β–ˆβ–Ž | 195/1444 [1:20:03<8:52:52, 25.60s/it] 14%|β–ˆβ–Ž | 196/1444 [1:20:25<8:29:58, 24.52s/it] 14%|β–ˆβ–Ž | 197/1444 [1:20:48<8:21:18, 24.12s/it] 14%|β–ˆβ–Ž | 198/1444 [1:21:16<8:40:51, 25.08s/it] 14%|β–ˆβ– | 199/1444 [1:21:42<8:45:55, 25.35s/it] 14%|β–ˆβ– | 200/1444 [1:22:10<9:01:53, 26.14s/it] {'loss': 0.6329, 'grad_norm': 0.1352369785308838, 'learning_rate': 9.697183552209288e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 903.58, 'epoch': 0.28}
14%|β–ˆβ– | 200/1444 [1:22:10<9:01:53, 26.14s/it] 14%|β–ˆβ– | 201/1444 [1:22:33<8:43:33, 25.27s/it] 14%|β–ˆβ– | 202/1444 [1:22:57<8:37:41, 25.01s/it] 14%|β–ˆβ– | 203/1444 [1:23:20<8:20:10, 24.18s/it] 14%|β–ˆβ– | 204/1444 [1:23:45<8:25:39, 24.47s/it] 14%|β–ˆβ– | 205/1444 [1:24:06<8:08:26, 23.65s/it] 14%|β–ˆβ– | 206/1444 [1:24:29<8:02:36, 23.39s/it] 14%|β–ˆβ– | 207/1444 [1:24:54<8:10:21, 23.78s/it] 14%|β–ˆβ– | 208/1444 [1:25:15<7:54:49, 23.05s/it] 14%|β–ˆβ– | 209/1444 [1:25:39<7:58:14, 23.23s/it] 15%|β–ˆβ– | 210/1444 [1:26:06<8:23:59, 24.50s/it] {'loss': 0.6452, 'grad_norm': 0.12526443600654602, 'learning_rate': 9.657579953134383e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 859.06, 'epoch': 0.29}
15%|β–ˆβ– | 210/1444 [1:26:06<8:23:59, 24.50s/it] 15%|β–ˆβ– | 211/1444 [1:26:31<8:26:15, 24.64s/it] 15%|β–ˆβ– | 212/1444 [1:26:56<8:23:42, 24.53s/it] 15%|β–ˆβ– | 213/1444 [1:27:23<8:42:36, 25.47s/it] 15%|β–ˆβ– | 214/1444 [1:27:45<8:19:20, 24.36s/it] 15%|β–ˆβ– | 215/1444 [1:28:10<8:22:19, 24.52s/it] 15%|β–ˆβ– | 216/1444 [1:28:31<8:00:24, 23.47s/it] 15%|β–ˆβ–Œ | 217/1444 [1:28:58<8:20:12, 24.46s/it] 15%|β–ˆβ–Œ | 218/1444 [1:29:21<8:09:50, 23.97s/it] 15%|β–ˆβ–Œ | 219/1444 [1:29:45<8:14:32, 24.22s/it] 15%|β–ˆβ–Œ | 220/1444 [1:30:13<8:36:01, 25.30s/it] {'loss': 0.6407, 'grad_norm': 0.11614521592855453, 'learning_rate': 9.615634468894752e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 850.93, 'epoch': 0.3}
15%|β–ˆβ–Œ | 220/1444 [1:30:13<8:36:01, 25.30s/it] 15%|β–ˆβ–Œ | 221/1444 [1:30:34<8:09:58, 24.04s/it] 15%|β–ˆβ–Œ | 222/1444 [1:31:00<8:18:24, 24.47s/it] 15%|β–ˆβ–Œ | 223/1444 [1:31:23<8:07:23, 23.95s/it] 16%|β–ˆβ–Œ | 224/1444 [1:31:42<7:38:07, 22.53s/it] 16%|β–ˆβ–Œ | 225/1444 [1:32:03<7:31:32, 22.23s/it] 16%|β–ˆβ–Œ | 226/1444 [1:32:24<7:23:35, 21.85s/it] 16%|β–ˆβ–Œ | 227/1444 [1:32:45<7:18:49, 21.63s/it] 16%|β–ˆβ–Œ | 228/1444 [1:33:09<7:32:37, 22.33s/it] 16%|β–ˆβ–Œ | 229/1444 [1:33:31<7:27:26, 22.10s/it] 16%|β–ˆβ–Œ | 230/1444 [1:33:53<7:28:02, 22.14s/it] {'loss': 0.6741, 'grad_norm': 0.1328686773777008, 'learning_rate': 9.571368190165863e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1038.17, 'epoch': 0.32}
16%|β–ˆβ–Œ | 230/1444 [1:33:53<7:28:02, 22.14s/it] 16%|β–ˆβ–Œ | 231/1444 [1:34:20<7:58:23, 23.66s/it] 16%|β–ˆβ–Œ | 232/1444 [1:34:44<7:58:53, 23.71s/it] 16%|β–ˆβ–Œ | 233/1444 [1:35:08<7:58:53, 23.73s/it] 16%|β–ˆβ–Œ | 234/1444 [1:35:33<8:04:13, 24.01s/it] 16%|β–ˆβ–‹ | 235/1444 [1:35:55<7:53:43, 23.51s/it] 16%|β–ˆβ–‹ | 236/1444 [1:36:19<7:53:58, 23.54s/it] 16%|β–ˆβ–‹ | 237/1444 [1:36:40<7:40:00, 22.87s/it] 16%|β–ˆβ–‹ | 238/1444 [1:37:04<7:46:20, 23.20s/it] 17%|β–ˆβ–‹ | 239/1444 [1:37:30<8:05:19, 24.17s/it] 17%|β–ˆβ–‹ | 240/1444 [1:37:53<7:55:20, 23.69s/it] {'loss': 0.6906, 'grad_norm': 0.13082517683506012, 'learning_rate': 9.524803374545548e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1066.43, 'epoch': 0.33}
17%|β–ˆβ–‹ | 240/1444 [1:37:53<7:55:20, 23.69s/it] 17%|β–ˆβ–‹ | 241/1444 [1:38:18<8:04:46, 24.18s/it] 17%|β–ˆβ–‹ | 242/1444 [1:38:40<7:48:51, 23.40s/it] 17%|β–ˆβ–‹ | 243/1444 [1:39:03<7:49:48, 23.47s/it] 17%|β–ˆβ–‹ | 244/1444 [1:39:29<7:59:56, 24.00s/it] 17%|β–ˆβ–‹ | 245/1444 [1:39:53<8:00:08, 24.03s/it] 17%|β–ˆβ–‹ | 246/1444 [1:40:17<8:03:23, 24.21s/it] 17%|β–ˆβ–‹ | 247/1444 [1:40:40<7:55:57, 23.86s/it] 17%|β–ˆβ–‹ | 248/1444 [1:41:01<7:39:10, 23.04s/it] 17%|β–ˆβ–‹ | 249/1444 [1:41:22<7:23:27, 22.27s/it] 17%|β–ˆβ–‹ | 250/1444 [1:41:46<7:34:19, 22.83s/it] {'loss': 0.6609, 'grad_norm': 0.1282692551612854, 'learning_rate': 9.475963435362614e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 986.84, 'epoch': 0.35}
17%|β–ˆβ–‹ | 250/1444 [1:41:46<7:34:19, 22.83s/it] 17%|β–ˆβ–‹ | 251/1444 [1:42:10<7:40:33, 23.16s/it] 17%|β–ˆβ–‹ | 252/1444 [1:42:30<7:22:56, 22.30s/it] 18%|β–ˆβ–Š | 253/1444 [1:42:56<7:42:31, 23.30s/it] 18%|β–ˆβ–Š | 254/1444 [1:43:20<7:48:57, 23.64s/it] 18%|β–ˆβ–Š | 255/1444 [1:43:43<7:40:10, 23.22s/it] 18%|β–ˆβ–Š | 256/1444 [1:44:06<7:41:36, 23.31s/it] 18%|β–ˆβ–Š | 257/1444 [1:44:31<7:48:49, 23.70s/it] 18%|β–ˆβ–Š | 258/1444 [1:44:56<8:00:21, 24.30s/it] 18%|β–ˆβ–Š | 259/1444 [1:45:21<8:03:12, 24.47s/it] 18%|β–ˆβ–Š | 260/1444 [1:45:42<7:43:20, 23.48s/it] {'loss': 0.6169, 'grad_norm': 0.13708311319351196, 'learning_rate': 9.424872929904358e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1126.25, 'epoch': 0.36}
18%|β–ˆβ–Š | 260/1444 [1:45:42<7:43:20, 23.48s/it] 18%|β–ˆβ–Š | 261/1444 [1:46:05<7:36:54, 23.17s/it] 18%|β–ˆβ–Š | 262/1444 [1:46:29<7:41:04, 23.41s/it] 18%|β–ˆβ–Š | 263/1444 [1:46:52<7:38:47, 23.31s/it] 18%|β–ˆβ–Š | 264/1444 [1:47:16<7:45:09, 23.65s/it] 18%|β–ˆβ–Š | 265/1444 [1:47:41<7:52:00, 24.02s/it] 18%|β–ˆβ–Š | 266/1444 [1:48:06<7:57:32, 24.32s/it] 18%|β–ˆβ–Š | 267/1444 [1:48:31<7:59:30, 24.44s/it] 19%|β–ˆβ–Š | 268/1444 [1:48:56<8:00:55, 24.54s/it] 19%|β–ˆβ–Š | 269/1444 [1:49:19<7:50:41, 24.04s/it] 19%|β–ˆβ–Š | 270/1444 [1:49:40<7:36:24, 23.33s/it] {'loss': 0.6574, 'grad_norm': 0.1323172152042389, 'learning_rate': 9.371557547068878e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1062.03, 'epoch': 0.37}
19%|β–ˆβ–Š | 270/1444 [1:49:40<7:36:24, 23.33s/it] 19%|β–ˆβ–‰ | 271/1444 [1:50:07<7:58:14, 24.46s/it] 19%|β–ˆβ–‰ | 272/1444 [1:50:32<7:58:56, 24.52s/it] 19%|β–ˆβ–‰ | 273/1444 [1:50:57<8:00:48, 24.64s/it] 19%|β–ˆβ–‰ | 274/1444 [1:51:21<7:55:47, 24.40s/it] 19%|β–ˆβ–‰ | 275/1444 [1:51:47<8:08:09, 25.05s/it] 19%|β–ˆβ–‰ | 276/1444 [1:52:14<8:15:51, 25.47s/it] 19%|β–ˆβ–‰ | 277/1444 [1:52:41<8:25:04, 25.97s/it] 19%|β–ˆβ–‰ | 278/1444 [1:53:05<8:11:53, 25.31s/it] 19%|β–ˆβ–‰ | 279/1444 [1:53:32<8:23:46, 25.95s/it] 19%|β–ˆβ–‰ | 280/1444 [1:53:58<8:21:10, 25.83s/it] {'loss': 0.6583, 'grad_norm': 0.12729060649871826, 'learning_rate': 9.316044094448392e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 941.33, 'epoch': 0.39}
19%|β–ˆβ–‰ | 280/1444 [1:53:58<8:21:10, 25.83s/it] 19%|β–ˆβ–‰ | 281/1444 [1:54:20<7:59:05, 24.72s/it] 20%|β–ˆβ–‰ | 282/1444 [1:54:42<7:41:49, 23.85s/it] 20%|β–ˆβ–‰ | 283/1444 [1:55:02<7:22:31, 22.87s/it] 20%|β–ˆβ–‰ | 284/1444 [1:55:28<7:35:46, 23.57s/it] 20%|β–ˆβ–‰ | 285/1444 [1:55:51<7:36:29, 23.63s/it] 20%|β–ˆβ–‰ | 286/1444 [1:56:14<7:29:03, 23.27s/it] 20%|β–ˆβ–‰ | 287/1444 [1:56:36<7:24:18, 23.04s/it] 20%|β–ˆβ–‰ | 288/1444 [1:57:01<7:31:10, 23.42s/it] 20%|β–ˆβ–ˆ | 289/1444 [1:57:24<7:31:33, 23.46s/it] 20%|β–ˆβ–ˆ | 290/1444 [1:57:51<7:50:58, 24.49s/it] {'loss': 0.6392, 'grad_norm': 0.14719286561012268, 'learning_rate': 9.25836048485008e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 883.56, 'epoch': 0.4}
20%|β–ˆβ–ˆ | 290/1444 [1:57:51<7:50:58, 24.49s/it] 20%|β–ˆβ–ˆ | 291/1444 [1:58:12<7:30:45, 23.46s/it] 20%|β–ˆβ–ˆ | 292/1444 [1:58:39<7:49:50, 24.47s/it] 20%|β–ˆβ–ˆ | 293/1444 [1:59:00<7:31:26, 23.53s/it] 20%|β–ˆβ–ˆ | 294/1444 [1:59:25<7:38:20, 23.91s/it] 20%|β–ˆβ–ˆ | 295/1444 [1:59:47<7:28:39, 23.43s/it] 20%|β–ˆβ–ˆ | 296/1444 [2:00:13<7:38:50, 23.98s/it] 21%|β–ˆβ–ˆ | 297/1444 [2:00:37<7:39:13, 24.02s/it] 21%|β–ˆβ–ˆ | 298/1444 [2:01:03<7:51:25, 24.68s/it] 21%|β–ˆβ–ˆ | 299/1444 [2:01:24<7:29:32, 23.56s/it] 21%|β–ˆβ–ˆ | 300/1444 [2:01:52<7:53:05, 24.81s/it] {'loss': 0.6623, 'grad_norm': 0.12530402839183807, 'learning_rate': 9.198535722261181e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 902.6, 'epoch': 0.42}
21%|β–ˆβ–ˆ | 300/1444 [2:01:52<7:53:05, 24.81s/it][2025-12-27 10:33:58,454] [INFO] [axolotl.core.trainers.base._save:671] [PID:8935] Saving model checkpoint to ./outputs/qwen32b-thai/checkpoint-300
21%|β–ˆβ–ˆ | 301/1444 [2:02:19<8:05:11, 25.47s/it] 21%|β–ˆβ–ˆ | 302/1444 [2:02:44<8:02:42, 25.36s/it] 21%|β–ˆβ–ˆ | 303/1444 [2:03:08<7:54:20, 24.94s/it] 21%|β–ˆβ–ˆ | 304/1444 [2:03:32<7:50:57, 24.79s/it] 21%|β–ˆβ–ˆ | 305/1444 [2:03:53<7:27:00, 23.55s/it] 21%|β–ˆβ–ˆ | 306/1444 [2:04:19<7:39:41, 24.24s/it] 21%|β–ˆβ–ˆβ– | 307/1444 [2:04:46<7:56:51, 25.16s/it] 21%|β–ˆβ–ˆβ– | 308/1444 [2:05:15<8:16:13, 26.21s/it] 21%|β–ˆβ–ˆβ– | 309/1444 [2:05:38<8:00:19, 25.39s/it] 21%|β–ˆβ–ˆβ– | 310/1444 [2:06:01<7:47:26, 24.73s/it] {'loss': 0.645, 'grad_norm': 0.1330760419368744, 'learning_rate': 9.136599887265483e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 997.96, 'epoch': 0.43}
21%|β–ˆβ–ˆβ– | 310/1444 [2:06:01<7:47:26, 24.73s/it] 22%|β–ˆβ–ˆβ– | 311/1444 [2:06:24<7:37:07, 24.21s/it] 22%|β–ˆβ–ˆβ– | 312/1444 [2:06:48<7:34:30, 24.09s/it] 22%|β–ˆβ–ˆβ– | 313/1444 [2:07:12<7:32:04, 23.98s/it] 22%|β–ˆβ–ˆβ– | 314/1444 [2:07:36<7:31:49, 23.99s/it] 22%|β–ˆβ–ˆβ– | 315/1444 [2:07:59<7:29:34, 23.89s/it] 22%|β–ˆβ–ˆβ– | 316/1444 [2:08:26<7:44:09, 24.69s/it] 22%|β–ˆβ–ˆβ– | 317/1444 [2:08:52<7:52:13, 25.14s/it] 22%|β–ˆβ–ˆβ– | 318/1444 [2:09:14<7:31:12, 24.04s/it] 22%|β–ˆβ–ˆβ– | 319/1444 [2:09:36<7:20:50, 23.51s/it] 22%|β–ˆβ–ˆβ– | 320/1444 [2:10:00<7:25:11, 23.76s/it] {'loss': 0.6139, 'grad_norm': 0.13317464292049408, 'learning_rate': 9.072584121918425e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 980.08, 'epoch': 0.44}
22%|β–ˆβ–ˆβ– | 320/1444 [2:10:00<7:25:11, 23.76s/it] 22%|β–ˆβ–ˆβ– | 321/1444 [2:10:22<7:14:12, 23.20s/it] 22%|β–ˆβ–ˆβ– | 322/1444 [2:10:46<7:15:18, 23.28s/it] 22%|β–ˆβ–ˆβ– | 323/1444 [2:11:13<7:39:26, 24.59s/it] 22%|β–ˆβ–ˆβ– | 324/1444 [2:11:40<7:51:46, 25.27s/it] 23%|β–ˆβ–ˆβ–Ž | 325/1444 [2:12:07<8:00:40, 25.77s/it] 23%|β–ˆβ–ˆβ–Ž | 326/1444 [2:12:30<7:46:06, 25.01s/it] 23%|β–ˆβ–ˆβ–Ž | 327/1444 [2:12:57<7:52:53, 25.40s/it] 23%|β–ˆβ–ˆβ–Ž | 328/1444 [2:13:19<7:35:30, 24.49s/it] 23%|β–ˆβ–ˆβ–Ž | 329/1444 [2:13:41<7:18:32, 23.60s/it] 23%|β–ˆβ–ˆβ–Ž | 330/1444 [2:14:05<7:24:29, 23.94s/it] {'loss': 0.6658, 'grad_norm': 0.12773385643959045, 'learning_rate': 9.006520614088535e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 937.57, 'epoch': 0.46}
23%|β–ˆβ–ˆβ–Ž | 330/1444 [2:14:05<7:24:29, 23.94s/it] 23%|β–ˆβ–ˆβ–Ž | 331/1444 [2:14:31<7:32:40, 24.40s/it] 23%|β–ˆβ–ˆβ–Ž | 332/1444 [2:14:56<7:37:18, 24.67s/it] 23%|β–ˆβ–ˆβ–Ž | 333/1444 [2:15:23<7:47:19, 25.24s/it] 23%|β–ˆβ–ˆβ–Ž | 334/1444 [2:15:49<7:53:07, 25.57s/it] 23%|β–ˆβ–ˆβ–Ž | 335/1444 [2:16:12<7:40:50, 24.93s/it] 23%|β–ˆβ–ˆβ–Ž | 336/1444 [2:16:33<7:16:42, 23.65s/it] 23%|β–ˆβ–ˆβ–Ž | 337/1444 [2:16:59<7:26:48, 24.22s/it] 23%|β–ˆβ–ˆβ–Ž | 338/1444 [2:17:23<7:26:41, 24.23s/it] 23%|β–ˆβ–ˆβ–Ž | 339/1444 [2:17:48<7:30:08, 24.44s/it] 24%|β–ˆβ–ˆβ–Ž | 340/1444 [2:18:11<7:21:35, 24.00s/it] {'loss': 0.6737, 'grad_norm': 0.13415341079235077, 'learning_rate': 8.938442581272983e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1046.92, 'epoch': 0.47}
24%|β–ˆβ–ˆβ–Ž | 340/1444 [2:18:11<7:21:35, 24.00s/it] 24%|β–ˆβ–ˆβ–Ž | 341/1444 [2:18:34<7:17:51, 23.82s/it] 24%|β–ˆβ–ˆβ–Ž | 342/1444 [2:18:59<7:20:39, 23.99s/it] 24%|β–ˆβ–ˆβ– | 343/1444 [2:19:23<7:21:04, 24.04s/it] 24%|β–ˆβ–ˆβ– | 344/1444 [2:19:45<7:09:39, 23.44s/it] 24%|β–ˆβ–ˆβ– | 345/1444 [2:20:09<7:15:13, 23.76s/it] 24%|β–ˆβ–ˆβ– | 346/1444 [2:20:34<7:22:45, 24.19s/it] 24%|β–ˆβ–ˆβ– | 347/1444 [2:21:01<7:37:32, 25.02s/it] 24%|β–ˆβ–ˆβ– | 348/1444 [2:21:27<7:40:14, 25.20s/it] 24%|β–ˆβ–ˆβ– | 349/1444 [2:21:51<7:32:16, 24.78s/it] 24%|β–ˆβ–ˆβ– | 350/1444 [2:22:15<7:27:30, 24.54s/it] {'loss': 0.6575, 'grad_norm': 0.13382680714130402, 'learning_rate': 8.868384253895445e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1049.72, 'epoch': 0.49}
24%|β–ˆβ–ˆβ– | 350/1444 [2:22:15<7:27:30, 24.54s/it] 24%|β–ˆβ–ˆβ– | 351/1444 [2:22:38<7:20:18, 24.17s/it] 24%|β–ˆβ–ˆβ– | 352/1444 [2:23:03<7:24:57, 24.45s/it] 24%|β–ˆβ–ˆβ– | 353/1444 [2:23:26<7:13:31, 23.84s/it] 25%|β–ˆβ–ˆβ– | 354/1444 [2:23:48<7:03:28, 23.31s/it] 25%|β–ˆβ–ˆβ– | 355/1444 [2:24:17<7:37:51, 25.23s/it] 25%|β–ˆβ–ˆβ– | 356/1444 [2:24:40<7:22:00, 24.38s/it] 25%|β–ˆβ–ˆβ– | 357/1444 [2:25:03<7:13:44, 23.94s/it] 25%|β–ˆβ–ˆβ– | 358/1444 [2:25:24<6:59:16, 23.16s/it] 25%|β–ˆβ–ˆβ– | 359/1444 [2:25:47<6:57:37, 23.09s/it] 25%|β–ˆβ–ˆβ– | 360/1444 [2:26:13<7:11:57, 23.91s/it] {'loss': 0.6423, 'grad_norm': 0.12621234357357025, 'learning_rate': 8.796380858094643e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 926.35, 'epoch': 0.5}
25%|β–ˆβ–ˆβ– | 360/1444 [2:26:13<7:11:57, 23.91s/it] 25%|β–ˆβ–ˆβ–Œ | 361/1444 [2:26:36<7:08:05, 23.72s/it] 25%|β–ˆβ–ˆβ–Œ | 362/1444 [2:27:00<7:10:03, 23.85s/it] 25%|β–ˆβ–ˆβ–Œ | 363/1444 [2:27:22<7:00:04, 23.32s/it] 25%|β–ˆβ–ˆβ–Œ | 364/1444 [2:27:49<7:17:53, 24.33s/it] 25%|β–ˆβ–ˆβ–Œ | 365/1444 [2:28:10<7:02:01, 23.47s/it] 25%|β–ˆβ–ˆβ–Œ | 366/1444 [2:28:36<7:14:30, 24.18s/it] 25%|β–ˆβ–ˆβ–Œ | 367/1444 [2:29:01<7:14:20, 24.20s/it] 25%|β–ˆβ–ˆβ–Œ | 368/1444 [2:29:22<6:57:35, 23.29s/it] 26%|β–ˆβ–ˆβ–Œ | 369/1444 [2:29:46<7:02:43, 23.59s/it] 26%|β–ˆβ–ˆβ–Œ | 370/1444 [2:30:10<7:02:16, 23.59s/it] {'loss': 0.6524, 'grad_norm': 0.14663882553577423, 'learning_rate': 8.722468598012245e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 993.98, 'epoch': 0.51}
26%|β–ˆβ–ˆβ–Œ | 370/1444 [2:30:10<7:02:16, 23.59s/it] 26%|β–ˆβ–ˆβ–Œ | 371/1444 [2:30:31<6:49:23, 22.89s/it] 26%|β–ˆβ–ˆβ–Œ | 372/1444 [2:30:59<7:19:12, 24.58s/it] 26%|β–ˆβ–ˆβ–Œ | 373/1444 [2:31:28<7:37:50, 25.65s/it] 26%|β–ˆβ–ˆβ–Œ | 374/1444 [2:31:51<7:26:39, 25.05s/it] 26%|β–ˆβ–ˆβ–Œ | 375/1444 [2:32:15<7:17:38, 24.56s/it] 26%|β–ˆβ–ˆβ–Œ | 376/1444 [2:32:41<7:26:54, 25.11s/it] 26%|β–ˆβ–ˆβ–Œ | 377/1444 [2:33:07<7:28:48, 25.24s/it] 26%|β–ˆβ–ˆβ–Œ | 378/1444 [2:33:31<7:24:36, 25.02s/it] 26%|β–ˆβ–ˆβ–Œ | 379/1444 [2:33:56<7:26:18, 25.14s/it] 26%|β–ˆβ–ˆβ–‹ | 380/1444 [2:34:24<7:38:24, 25.85s/it] {'loss': 0.6158, 'grad_norm': 0.12107036262750626, 'learning_rate': 8.646684637588991e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 882.74, 'epoch': 0.53}
26%|β–ˆβ–ˆβ–‹ | 380/1444 [2:34:24<7:38:24, 25.85s/it] 26%|β–ˆβ–ˆβ–‹ | 381/1444 [2:34:48<7:29:46, 25.39s/it] 26%|β–ˆβ–ˆβ–‹ | 382/1444 [2:35:13<7:26:07, 25.20s/it] 27%|β–ˆβ–ˆβ–‹ | 383/1444 [2:35:37<7:17:57, 24.77s/it] 27%|β–ˆβ–ˆβ–‹ | 384/1444 [2:36:02<7:19:45, 24.89s/it] 27%|β–ˆβ–ˆβ–‹ | 385/1444 [2:36:29<7:32:20, 25.63s/it] 27%|β–ˆβ–ˆβ–‹ | 386/1444 [2:36:53<7:23:07, 25.13s/it] 27%|β–ˆβ–ˆβ–‹ | 387/1444 [2:37:21<7:36:15, 25.90s/it] 27%|β–ˆβ–ˆβ–‹ | 388/1444 [2:37:45<7:25:17, 25.30s/it] 27%|β–ˆβ–ˆβ–‹ | 389/1444 [2:38:09<7:17:47, 24.90s/it] 27%|β–ˆβ–ˆβ–‹ | 390/1444 [2:38:33<7:11:58, 24.59s/it] {'loss': 0.6359, 'grad_norm': 0.12905746698379517, 'learning_rate': 8.56906708187824e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 992.61, 'epoch': 0.54}
27%|β–ˆβ–ˆβ–‹ | 390/1444 [2:38:33<7:11:58, 24.59s/it] 27%|β–ˆβ–ˆβ–‹ | 391/1444 [2:38:56<7:04:06, 24.17s/it] 27%|β–ˆβ–ˆβ–‹ | 392/1444 [2:39:19<6:58:23, 23.86s/it] 27%|β–ˆβ–ˆβ–‹ | 393/1444 [2:39:46<7:13:30, 24.75s/it] 27%|β–ˆβ–ˆβ–‹ | 394/1444 [2:40:12<7:21:40, 25.24s/it] 27%|β–ˆβ–ˆβ–‹ | 395/1444 [2:40:38<7:23:50, 25.39s/it] 27%|β–ˆβ–ˆβ–‹ | 396/1444 [2:41:03<7:23:58, 25.42s/it] 27%|β–ˆβ–ˆβ–‹ | 397/1444 [2:41:27<7:14:14, 24.88s/it] 28%|β–ˆβ–ˆβ–Š | 398/1444 [2:41:54<7:23:24, 25.43s/it] 28%|β–ˆβ–ˆβ–Š | 399/1444 [2:42:19<7:18:58, 25.20s/it] 28%|β–ˆβ–ˆβ–Š | 400/1444 [2:42:45<7:26:54, 25.68s/it] {'loss': 0.6124, 'grad_norm': 0.14433123171329498, 'learning_rate': 8.489654957886306e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 871.5, 'epoch': 0.55}
28%|β–ˆβ–ˆβ–Š | 400/1444 [2:42:45<7:26:54, 25.68s/it] 28%|β–ˆβ–ˆβ–Š | 401/1444 [2:43:10<7:20:46, 25.36s/it] 28%|β–ˆβ–ˆβ–Š | 402/1444 [2:43:37<7:27:04, 25.74s/it] 28%|β–ˆβ–ˆβ–Š | 403/1444 [2:44:01<7:21:12, 25.43s/it] 28%|β–ˆβ–ˆβ–Š | 404/1444 [2:44:24<7:08:23, 24.71s/it] 28%|β–ˆβ–ˆβ–Š | 405/1444 [2:44:47<6:59:28, 24.22s/it] 28%|β–ˆβ–ˆβ–Š | 406/1444 [2:45:10<6:50:13, 23.71s/it] 28%|β–ˆβ–ˆβ–Š | 407/1444 [2:45:33<6:47:40, 23.59s/it] 28%|β–ˆβ–ˆβ–Š | 408/1444 [2:45:59<6:58:10, 24.22s/it] 28%|β–ˆβ–ˆβ–Š | 409/1444 [2:46:23<6:55:20, 24.08s/it] 28%|β–ˆβ–ˆβ–Š | 410/1444 [2:46:48<7:00:50, 24.42s/it] {'loss': 0.6803, 'grad_norm': 0.13294072449207306, 'learning_rate': 8.40848819494923e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 950.11, 'epoch': 0.57}
28%|β–ˆβ–ˆβ–Š | 410/1444 [2:46:48<7:00:50, 24.42s/it] 28%|β–ˆβ–ˆβ–Š | 411/1444 [2:47:11<6:53:06, 23.99s/it] 29%|β–ˆβ–ˆβ–Š | 412/1444 [2:47:34<6:46:41, 23.64s/it] 29%|β–ˆβ–ˆβ–Š | 413/1444 [2:47:57<6:42:45, 23.44s/it] 29%|β–ˆβ–ˆβ–Š | 414/1444 [2:48:18<6:34:11, 22.96s/it] 29%|β–ˆβ–ˆβ–Š | 415/1444 [2:48:44<6:49:06, 23.85s/it] 29%|β–ˆβ–ˆβ–‰ | 416/1444 [2:49:08<6:47:45, 23.80s/it] 29%|β–ˆβ–ˆβ–‰ | 417/1444 [2:49:33<6:53:29, 24.16s/it] 29%|β–ˆβ–ˆβ–‰ | 418/1444 [2:49:57<6:51:10, 24.05s/it] 29%|β–ˆβ–ˆβ–‰ | 419/1444 [2:50:21<6:50:09, 24.01s/it] 29%|β–ˆβ–ˆβ–‰ | 420/1444 [2:50:43<6:42:08, 23.56s/it] {'loss': 0.6088, 'grad_norm': 0.1526036411523819, 'learning_rate': 8.325607604655839e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1025.1, 'epoch': 0.58}
29%|β–ˆβ–ˆβ–‰ | 420/1444 [2:50:43<6:42:08, 23.56s/it] 29%|β–ˆβ–ˆβ–‰ | 421/1444 [2:51:08<6:48:13, 23.94s/it] 29%|β–ˆβ–ˆβ–‰ | 422/1444 [2:51:31<6:40:30, 23.51s/it] 29%|β–ˆβ–ˆβ–‰ | 423/1444 [2:51:53<6:34:54, 23.21s/it] 29%|β–ˆβ–ˆβ–‰ | 424/1444 [2:52:17<6:38:04, 23.42s/it] 29%|β–ˆβ–ˆβ–‰ | 425/1444 [2:52:43<6:50:03, 24.14s/it] 30%|β–ˆβ–ˆβ–‰ | 426/1444 [2:53:07<6:48:32, 24.08s/it] 30%|β–ˆβ–ˆβ–‰ | 427/1444 [2:53:33<6:58:37, 24.70s/it] 30%|β–ˆβ–ˆβ–‰ | 428/1444 [2:54:01<7:14:17, 25.65s/it] 30%|β–ˆβ–ˆβ–‰ | 429/1444 [2:54:26<7:13:11, 25.61s/it] 30%|β–ˆβ–ˆβ–‰ | 430/1444 [2:54:49<6:55:44, 24.60s/it] {'loss': 0.6669, 'grad_norm': 0.1453717052936554, 'learning_rate': 8.241054860327216e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1074.36, 'epoch': 0.6}
30%|β–ˆβ–ˆβ–‰ | 430/1444 [2:54:49<6:55:44, 24.60s/it] 30%|β–ˆβ–ˆβ–‰ | 431/1444 [2:55:15<7:02:07, 25.00s/it] 30%|β–ˆβ–ˆβ–‰ | 432/1444 [2:55:38<6:54:22, 24.57s/it] 30%|β–ˆβ–ˆβ–‰ | 433/1444 [2:56:03<6:56:12, 24.70s/it] 30%|β–ˆβ–ˆβ–ˆ | 434/1444 [2:56:32<7:16:54, 25.95s/it] 30%|β–ˆβ–ˆβ–ˆ | 435/1444 [2:56:59<7:23:55, 26.40s/it] 30%|β–ˆβ–ˆβ–ˆ | 436/1444 [2:57:24<7:14:33, 25.87s/it] 30%|β–ˆβ–ˆβ–ˆ | 437/1444 [2:57:48<7:05:04, 25.33s/it] 30%|β–ˆβ–ˆβ–ˆ | 438/1444 [2:58:14<7:05:03, 25.35s/it] 30%|β–ˆβ–ˆβ–ˆ | 439/1444 [2:58:36<6:50:49, 24.53s/it] 30%|β–ˆβ–ˆβ–ˆ | 440/1444 [2:58:59<6:44:12, 24.16s/it] {'loss': 0.6147, 'grad_norm': 0.1466919481754303, 'learning_rate': 8.154872476062868e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1010.83, 'epoch': 0.61}
30%|β–ˆβ–ˆβ–ˆ | 440/1444 [2:58:59<6:44:12, 24.16s/it] 31%|β–ˆβ–ˆβ–ˆ | 441/1444 [2:59:26<6:55:32, 24.86s/it] 31%|β–ˆβ–ˆβ–ˆ | 442/1444 [2:59:52<7:01:49, 25.26s/it] 31%|β–ˆβ–ˆβ–ˆ | 443/1444 [3:00:18<7:03:23, 25.38s/it] 31%|β–ˆβ–ˆβ–ˆ | 444/1444 [3:00:43<7:00:21, 25.22s/it] 31%|β–ˆβ–ˆβ–ˆ | 445/1444 [3:01:08<7:00:04, 25.23s/it] 31%|β–ˆβ–ˆβ–ˆ | 446/1444 [3:01:32<6:55:51, 25.00s/it] 31%|β–ˆβ–ˆβ–ˆ | 447/1444 [3:01:59<7:04:38, 25.56s/it] 31%|β–ˆβ–ˆβ–ˆ | 448/1444 [3:02:22<6:50:12, 24.71s/it] 31%|β–ˆβ–ˆβ–ˆ | 449/1444 [3:02:46<6:45:49, 24.47s/it] 31%|β–ˆβ–ˆβ–ˆ | 450/1444 [3:03:11<6:48:23, 24.65s/it] {'loss': 0.6096, 'grad_norm': 0.12707076966762543, 'learning_rate': 8.067103785364139e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 919.83, 'epoch': 0.62}
31%|β–ˆβ–ˆβ–ˆ | 450/1444 [3:03:11<6:48:23, 24.65s/it] 31%|β–ˆβ–ˆβ–ˆ | 451/1444 [3:03:37<6:52:49, 24.94s/it] 31%|β–ˆβ–ˆβ–ˆβ– | 452/1444 [3:04:02<6:53:28, 25.01s/it] 31%|β–ˆβ–ˆβ–ˆβ– | 453/1444 [3:04:25<6:47:06, 24.65s/it] 31%|β–ˆβ–ˆβ–ˆβ– | 454/1444 [3:04:49<6:43:30, 24.46s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 455/1444 [3:05:13<6:40:29, 24.30s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 456/1444 [3:05:35<6:26:49, 23.49s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 457/1444 [3:06:01<6:36:24, 24.10s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 458/1444 [3:06:26<6:44:02, 24.59s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 459/1444 [3:06:50<6:39:47, 24.35s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 460/1444 [3:07:13<6:29:58, 23.78s/it] {'loss': 0.6342, 'grad_norm': 0.13485883176326752, 'learning_rate': 7.977792919345633e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1033.87, 'epoch': 0.64}
32%|β–ˆβ–ˆβ–ˆβ– | 460/1444 [3:07:13<6:29:58, 23.78s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 461/1444 [3:07:37<6:34:40, 24.09s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 462/1444 [3:08:06<6:56:35, 25.45s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 463/1444 [3:08:32<7:00:33, 25.72s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 464/1444 [3:08:57<6:54:49, 25.40s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 465/1444 [3:09:25<7:07:40, 26.21s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 466/1444 [3:09:48<6:52:23, 25.30s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 467/1444 [3:10:16<7:01:32, 25.89s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 468/1444 [3:10:41<7:00:53, 25.87s/it] 32%|β–ˆβ–ˆβ–ˆβ– | 469/1444 [3:11:05<6:49:38, 25.21s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 470/1444 [3:11:29<6:44:21, 24.91s/it] {'loss': 0.6256, 'grad_norm': 0.12489234656095505, 'learning_rate': 7.886984784545566e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1018.58, 'epoch': 0.65}
33%|β–ˆβ–ˆβ–ˆβ–Ž | 470/1444 [3:11:29<6:44:21, 24.91s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 471/1444 [3:11:53<6:36:52, 24.47s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 472/1444 [3:12:14<6:18:45, 23.38s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 473/1444 [3:12:38<6:25:42, 23.83s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 474/1444 [3:13:01<6:17:58, 23.38s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 475/1444 [3:13:28<6:36:52, 24.57s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 476/1444 [3:13:54<6:42:08, 24.93s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 477/1444 [3:14:16<6:29:12, 24.15s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 478/1444 [3:14:40<6:25:51, 23.97s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 479/1444 [3:15:04<6:28:04, 24.13s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 480/1444 [3:15:29<6:32:40, 24.44s/it] {'loss': 0.6455, 'grad_norm': 0.16094225645065308, 'learning_rate': 7.794725040346251e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 933.16, 'epoch': 0.67}
33%|β–ˆβ–ˆβ–ˆβ–Ž | 480/1444 [3:15:29<6:32:40, 24.44s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 481/1444 [3:15:53<6:29:23, 24.26s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 482/1444 [3:16:19<6:35:50, 24.69s/it] 33%|β–ˆβ–ˆβ–ˆβ–Ž | 483/1444 [3:16:44<6:39:32, 24.95s/it] 34%|β–ˆβ–ˆβ–ˆβ–Ž | 484/1444 [3:17:07<6:26:39, 24.17s/it] 34%|β–ˆβ–ˆβ–ˆβ–Ž | 485/1444 [3:17:30<6:20:30, 23.81s/it] 34%|β–ˆβ–ˆβ–ˆβ–Ž | 486/1444 [3:17:52<6:11:47, 23.29s/it] 34%|β–ˆβ–ˆβ–ˆβ–Ž | 487/1444 [3:18:17<6:21:41, 23.93s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 488/1444 [3:18:39<6:12:07, 23.36s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 489/1444 [3:19:02<6:06:51, 23.05s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 490/1444 [3:19:26<6:13:43, 23.50s/it] {'loss': 0.6613, 'grad_norm': 0.1351306140422821, 'learning_rate': 7.701060076016024e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 970.62, 'epoch': 0.68}
34%|β–ˆβ–ˆβ–ˆβ– | 490/1444 [3:19:26<6:13:43, 23.50s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 491/1444 [3:19:54<6:35:03, 24.87s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 492/1444 [3:20:19<6:34:55, 24.89s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 493/1444 [3:20:43<6:31:16, 24.69s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 494/1444 [3:21:08<6:29:14, 24.58s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 495/1444 [3:21:29<6:15:14, 23.72s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 496/1444 [3:21:53<6:15:58, 23.80s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 497/1444 [3:22:20<6:28:04, 24.59s/it] 34%|β–ˆβ–ˆβ–ˆβ– | 498/1444 [3:22:46<6:35:09, 25.06s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 499/1444 [3:23:12<6:40:40, 25.44s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 500/1444 [3:23:38<6:39:46, 25.41s/it] {'loss': 0.6186, 'grad_norm': 0.1223958283662796, 'learning_rate': 7.606036987384184e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 925.45, 'epoch': 0.69}
35%|β–ˆβ–ˆβ–ˆβ– | 500/1444 [3:23:38<6:39:46, 25.41s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 501/1444 [3:24:02<6:33:54, 25.06s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 502/1444 [3:24:28<6:39:23, 25.44s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 503/1444 [3:24:55<6:44:18, 25.78s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 504/1444 [3:25:21<6:45:23, 25.88s/it] 35%|β–ˆβ–ˆβ–ˆβ– | 505/1444 [3:25:44<6:33:33, 25.15s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 506/1444 [3:26:09<6:29:00, 24.88s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 507/1444 [3:26:31<6:18:48, 24.26s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 508/1444 [3:27:00<6:37:28, 25.48s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 509/1444 [3:27:25<6:35:48, 25.40s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 510/1444 [3:27:49<6:29:35, 25.03s/it] {'loss': 0.646, 'grad_norm': 0.11873335391283035, 'learning_rate': 7.509703553160666e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 990.93, 'epoch': 0.71}
35%|β–ˆβ–ˆβ–ˆβ–Œ | 510/1444 [3:27:49<6:29:35, 25.03s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 511/1444 [3:28:17<6:44:21, 26.00s/it] 35%|β–ˆβ–ˆβ–ˆβ–Œ | 512/1444 [3:28:41<6:30:52, 25.16s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 513/1444 [3:29:07<6:35:48, 25.51s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 514/1444 [3:29:32<6:33:23, 25.38s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 515/1444 [3:29:59<6:40:09, 25.84s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 516/1444 [3:30:24<6:36:50, 25.66s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 517/1444 [3:30:48<6:26:19, 25.00s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 518/1444 [3:31:11<6:16:14, 24.38s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 519/1444 [3:31:32<6:02:46, 23.53s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 520/1444 [3:31:57<6:06:57, 23.83s/it] {'loss': 0.6155, 'grad_norm': 0.12990343570709229, 'learning_rate': 7.412108210912345e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 955.97, 'epoch': 0.72}
36%|β–ˆβ–ˆβ–ˆβ–Œ | 520/1444 [3:31:57<6:06:57, 23.83s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 521/1444 [3:32:19<6:01:15, 23.48s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 522/1444 [3:32:45<6:09:59, 24.08s/it] 36%|β–ˆβ–ˆβ–ˆβ–Œ | 523/1444 [3:33:11<6:18:03, 24.63s/it] 36%|β–ˆβ–ˆβ–ˆβ–‹ | 524/1444 [3:33:35<6:18:22, 24.68s/it] 36%|β–ˆβ–ˆβ–ˆβ–‹ | 525/1444 [3:34:02<6:26:50, 25.26s/it] 36%|β–ˆβ–ˆβ–ˆβ–‹ | 526/1444 [3:34:25<6:15:55, 24.57s/it] 36%|β–ˆβ–ˆβ–ˆβ–‹ | 527/1444 [3:34:51<6:21:49, 24.98s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 528/1444 [3:35:13<6:09:23, 24.20s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 529/1444 [3:35:40<6:20:08, 24.93s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 530/1444 [3:36:07<6:29:15, 25.55s/it] {'loss': 0.6443, 'grad_norm': 0.1376057118177414, 'learning_rate': 7.31330003270808e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 895.07, 'epoch': 0.73}
37%|β–ˆβ–ˆβ–ˆβ–‹ | 530/1444 [3:36:07<6:29:15, 25.55s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 531/1444 [3:36:31<6:21:41, 25.08s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 532/1444 [3:36:55<6:15:49, 24.72s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 533/1444 [3:37:19<6:13:43, 24.61s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 534/1444 [3:37:46<6:21:15, 25.14s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 535/1444 [3:38:08<6:08:57, 24.35s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 536/1444 [3:38:29<5:52:33, 23.30s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 537/1444 [3:38:54<5:58:50, 23.74s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 538/1444 [3:39:21<6:15:47, 24.89s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 539/1444 [3:39:48<6:25:29, 25.56s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 540/1444 [3:40:15<6:27:54, 25.75s/it] {'loss': 0.6188, 'grad_norm': 0.13277359306812286, 'learning_rate': 7.213328700444696e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 907.27, 'epoch': 0.75}
37%|β–ˆβ–ˆβ–ˆβ–‹ | 540/1444 [3:40:15<6:27:54, 25.75s/it] 37%|β–ˆβ–ˆβ–ˆβ–‹ | 541/1444 [3:40:41<6:30:50, 25.97s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 542/1444 [3:41:05<6:22:54, 25.47s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 543/1444 [3:41:27<6:03:37, 24.21s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 544/1444 [3:41:55<6:23:28, 25.56s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 545/1444 [3:42:22<6:25:35, 25.73s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 546/1444 [3:42:48<6:27:58, 25.92s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 547/1444 [3:43:14<6:28:31, 25.99s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 548/1444 [3:43:37<6:16:00, 25.18s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 549/1444 [3:44:02<6:12:37, 24.98s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 550/1444 [3:44:26<6:06:49, 24.62s/it] {'loss': 0.6471, 'grad_norm': 0.13623632490634918, 'learning_rate': 7.112244480866356e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1002.06, 'epoch': 0.76}
38%|β–ˆβ–ˆβ–ˆβ–Š | 550/1444 [3:44:26<6:06:49, 24.62s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 551/1444 [3:44:52<6:12:41, 25.04s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 552/1444 [3:45:19<6:20:51, 25.62s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 553/1444 [3:45:42<6:08:21, 24.80s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 554/1444 [3:46:09<6:21:31, 25.72s/it] 38%|β–ˆβ–ˆβ–ˆβ–Š | 555/1444 [3:46:33<6:12:22, 25.13s/it] 39%|β–ˆβ–ˆβ–ˆβ–Š | 556/1444 [3:46:54<5:51:45, 23.77s/it] 39%|β–ˆβ–ˆβ–ˆβ–Š | 557/1444 [3:47:17<5:50:45, 23.73s/it] 39%|β–ˆβ–ˆβ–ˆβ–Š | 558/1444 [3:47:43<5:59:00, 24.31s/it] 39%|β–ˆβ–ˆβ–ˆβ–Š | 559/1444 [3:48:08<6:00:06, 24.41s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 560/1444 [3:48:30<5:50:05, 23.76s/it] {'loss': 0.647, 'grad_norm': 0.13037075102329254, 'learning_rate': 7.010098200289859e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1047.58, 'epoch': 0.78}
39%|β–ˆβ–ˆβ–ˆβ–‰ | 560/1444 [3:48:30<5:50:05, 23.76s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 561/1444 [3:48:53<5:46:59, 23.58s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 562/1444 [3:49:16<5:43:44, 23.38s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 563/1444 [3:49:39<5:43:27, 23.39s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 564/1444 [3:50:03<5:43:43, 23.44s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 565/1444 [3:50:29<5:53:40, 24.14s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 566/1444 [3:50:56<6:06:57, 25.08s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 567/1444 [3:51:17<5:48:39, 23.85s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 568/1444 [3:51:39<5:40:55, 23.35s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 569/1444 [3:52:07<5:57:53, 24.54s/it] 39%|β–ˆβ–ˆβ–ˆβ–‰ | 570/1444 [3:52:29<5:46:46, 23.81s/it] {'loss': 0.6071, 'grad_norm': 0.12997141480445862, 'learning_rate': 6.906941219048584e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1073.59, 'epoch': 0.79}
39%|β–ˆβ–ˆβ–ˆβ–‰ | 570/1444 [3:52:29<5:46:46, 23.81s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 571/1444 [3:52:53<5:50:10, 24.07s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 572/1444 [3:53:16<5:42:51, 23.59s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 573/1444 [3:53:44<6:01:52, 24.93s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 574/1444 [3:54:06<5:50:15, 24.16s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 575/1444 [3:54:27<5:33:44, 23.04s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 576/1444 [3:54:51<5:37:19, 23.32s/it] 40%|β–ˆβ–ˆβ–ˆβ–‰ | 577/1444 [3:55:14<5:37:45, 23.37s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 578/1444 [3:55:40<5:46:53, 24.03s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 579/1444 [3:56:03<5:44:36, 23.90s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 580/1444 [3:56:25<5:33:50, 23.18s/it] {'loss': 0.6101, 'grad_norm': 0.14118416607379913, 'learning_rate': 6.802825405667905e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1097.31, 'epoch': 0.8}
40%|β–ˆβ–ˆβ–ˆβ–ˆ | 580/1444 [3:56:25<5:33:50, 23.18s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 581/1444 [3:56:48<5:32:06, 23.09s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 582/1444 [3:57:12<5:38:53, 23.59s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 583/1444 [3:57:34<5:32:00, 23.14s/it] 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 584/1444 [3:58:02<5:51:17, 24.51s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 585/1444 [3:58:22<5:30:23, 23.08s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 586/1444 [3:58:44<5:25:01, 22.73s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 587/1444 [3:59:07<5:26:11, 22.84s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 588/1444 [3:59:33<5:39:28, 23.79s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 589/1444 [3:59:59<5:47:36, 24.39s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 590/1444 [4:00:24<5:49:59, 24.59s/it] {'loss': 0.6084, 'grad_norm': 0.126139834523201, 'learning_rate': 6.697803110785115e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 933.94, 'epoch': 0.82}
41%|β–ˆβ–ˆβ–ˆβ–ˆ | 590/1444 [4:00:24<5:49:59, 24.59s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 591/1444 [4:00:48<5:46:40, 24.39s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 592/1444 [4:01:15<5:57:46, 25.19s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 593/1444 [4:01:41<6:00:18, 25.40s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 594/1444 [4:02:04<5:50:30, 24.74s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆ | 595/1444 [4:02:31<6:00:13, 25.46s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 596/1444 [4:02:54<5:51:31, 24.87s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 597/1444 [4:03:18<5:44:12, 24.38s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 598/1444 [4:03:42<5:41:23, 24.21s/it] 41%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 599/1444 [4:04:05<5:37:43, 23.98s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 600/1444 [4:04:29<5:37:25, 23.99s/it] {'loss': 0.6416, 'grad_norm': 0.1207822933793068, 'learning_rate': 6.591927140826902e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1007.52, 'epoch': 0.83}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 600/1444 [4:04:29<5:37:25, 23.99s/it][2025-12-27 12:36:35,854] [INFO] [axolotl.core.trainers.base._save:671] [PID:8935] Saving model checkpoint to ./outputs/qwen32b-thai/checkpoint-600
42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 601/1444 [4:04:56<5:49:19, 24.86s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 602/1444 [4:05:17<5:34:36, 23.84s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 603/1444 [4:05:44<5:46:01, 24.69s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 604/1444 [4:06:12<6:01:22, 25.81s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 605/1444 [4:06:39<6:03:54, 26.02s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 606/1444 [4:07:00<5:43:36, 24.60s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 607/1444 [4:07:24<5:38:20, 24.25s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 608/1444 [4:07:49<5:44:11, 24.70s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 609/1444 [4:08:11<5:32:06, 23.86s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 610/1444 [4:08:38<5:45:23, 24.85s/it] {'loss': 0.6102, 'grad_norm': 0.1316983848810196, 'learning_rate': 6.485250731457678e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 866.4, 'epoch': 0.85}
42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 610/1444 [4:08:38<5:45:23, 24.85s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 611/1444 [4:09:04<5:46:26, 24.95s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 612/1444 [4:09:28<5:42:07, 24.67s/it] 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 613/1444 [4:09:49<5:27:36, 23.65s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 614/1444 [4:10:15<5:35:24, 24.25s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 615/1444 [4:10:39<5:35:45, 24.30s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 616/1444 [4:11:04<5:39:30, 24.60s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 617/1444 [4:11:31<5:47:44, 25.23s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 618/1444 [4:11:57<5:52:13, 25.59s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 619/1444 [4:12:19<5:33:55, 24.29s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 620/1444 [4:12:41<5:27:19, 23.83s/it] {'loss': 0.6426, 'grad_norm': 0.13538379967212677, 'learning_rate': 6.377827520812061e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1044.49, 'epoch': 0.86}
43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 620/1444 [4:12:41<5:27:19, 23.83s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 621/1444 [4:13:08<5:37:39, 24.62s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 622/1444 [4:13:33<5:40:36, 24.86s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 623/1444 [4:13:59<5:43:07, 25.08s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 624/1444 [4:14:21<5:29:53, 24.14s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 625/1444 [4:14:42<5:17:49, 23.28s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 626/1444 [4:15:09<5:31:33, 24.32s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 627/1444 [4:15:33<5:29:12, 24.18s/it] 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 628/1444 [4:15:54<5:16:05, 23.24s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 629/1444 [4:16:21<5:33:37, 24.56s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 630/1444 [4:16:47<5:37:39, 24.89s/it] {'loss': 0.6029, 'grad_norm': 0.1406071037054062, 'learning_rate': 6.269711522525006e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 966.75, 'epoch': 0.87}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 630/1444 [4:16:47<5:37:39, 24.89s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 631/1444 [4:17:12<5:39:01, 25.02s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 632/1444 [4:17:36<5:31:20, 24.48s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 633/1444 [4:18:00<5:32:15, 24.58s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 634/1444 [4:18:26<5:34:57, 24.81s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 635/1444 [4:18:49<5:28:22, 24.35s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 636/1444 [4:19:16<5:38:35, 25.14s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 637/1444 [4:19:39<5:30:11, 24.55s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 638/1444 [4:20:05<5:33:46, 24.85s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 639/1444 [4:20:28<5:27:57, 24.44s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 640/1444 [4:20:52<5:23:30, 24.14s/it] {'loss': 0.6103, 'grad_norm': 0.1416017860174179, 'learning_rate': 6.160957098573119e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 996.12, 'epoch': 0.89}
44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 640/1444 [4:20:52<5:23:30, 24.14s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 641/1444 [4:21:18<5:30:39, 24.71s/it] 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 642/1444 [4:21:43<5:30:39, 24.74s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 643/1444 [4:22:07<5:28:33, 24.61s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 644/1444 [4:22:35<5:41:46, 25.63s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 645/1444 [4:23:01<5:41:18, 25.63s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 646/1444 [4:23:27<5:43:02, 25.79s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 647/1444 [4:23:54<5:49:07, 26.28s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 648/1444 [4:24:21<5:51:07, 26.47s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 649/1444 [4:24:46<5:45:06, 26.05s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 650/1444 [4:25:09<5:30:45, 24.99s/it] {'loss': 0.6015, 'grad_norm': 0.14168281853199005, 'learning_rate': 6.05161893194083e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1057.0, 'epoch': 0.9}
45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 650/1444 [4:25:09<5:30:45, 24.99s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 651/1444 [4:25:34<5:33:24, 25.23s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 652/1444 [4:25:59<5:29:12, 24.94s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 653/1444 [4:26:25<5:36:00, 25.49s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 654/1444 [4:26:50<5:33:35, 25.34s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 655/1444 [4:27:13<5:22:11, 24.50s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 656/1444 [4:27:40<5:32:07, 25.29s/it] 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 657/1444 [4:28:06<5:34:16, 25.48s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 658/1444 [4:28:31<5:31:13, 25.28s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 659/1444 [4:28:53<5:19:14, 24.40s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 660/1444 [4:29:18<5:19:06, 24.42s/it] {'loss': 0.5851, 'grad_norm': 0.14854960143566132, 'learning_rate': 5.941751999125149e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 992.44, 'epoch': 0.91}
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 660/1444 [4:29:18<5:19:06, 24.42s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 661/1444 [4:29:42<5:17:36, 24.34s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 662/1444 [4:30:06<5:15:50, 24.23s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 663/1444 [4:30:30<5:14:35, 24.17s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 664/1444 [4:30:57<5:24:09, 24.94s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 665/1444 [4:31:20<5:18:07, 24.50s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 666/1444 [4:31:47<5:25:40, 25.12s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 667/1444 [4:32:08<5:11:56, 24.09s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 668/1444 [4:32:31<5:06:44, 23.72s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 669/1444 [4:32:55<5:07:35, 23.81s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 670/1444 [4:33:18<5:02:13, 23.43s/it] {'loss': 0.6221, 'grad_norm': 0.13109387457370758, 'learning_rate': 5.831411542492854e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1056.52, 'epoch': 0.93}
46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 670/1444 [4:33:18<5:02:13, 23.43s/it] 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 671/1444 [4:33:41<4:59:27, 23.24s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 672/1444 [4:34:07<5:11:43, 24.23s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 673/1444 [4:34:34<5:21:36, 25.03s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 674/1444 [4:34:57<5:15:09, 24.56s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 675/1444 [4:35:22<5:16:12, 24.67s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 676/1444 [4:35:47<5:15:17, 24.63s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 677/1444 [4:36:12<5:15:43, 24.70s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 678/1444 [4:36:37<5:16:10, 24.77s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 679/1444 [4:37:00<5:10:36, 24.36s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 680/1444 [4:37:22<5:01:32, 23.68s/it] {'loss': 0.5828, 'grad_norm': 0.13833071291446686, 'learning_rate': 5.720653042503978e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1061.91, 'epoch': 0.94}
47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 680/1444 [4:37:22<5:01:32, 23.68s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 681/1444 [4:37:46<5:00:13, 23.61s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 682/1444 [4:38:09<4:57:33, 23.43s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 683/1444 [4:38:30<4:48:58, 22.78s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 684/1444 [4:38:57<5:04:23, 24.03s/it] 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 685/1444 [4:39:22<5:07:49, 24.33s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 686/1444 [4:39:45<5:01:02, 23.83s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 687/1444 [4:40:13<5:17:31, 25.17s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 688/1444 [4:40:38<5:18:57, 25.31s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 689/1444 [4:41:02<5:10:33, 24.68s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 690/1444 [4:41:25<5:03:41, 24.17s/it] {'loss': 0.5827, 'grad_norm': 0.1346118003129959, 'learning_rate': 5.6095321898156016e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1017.14, 'epoch': 0.96}
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 690/1444 [4:41:25<5:03:41, 24.17s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 691/1444 [4:41:51<5:09:49, 24.69s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 692/1444 [4:42:15<5:09:27, 24.69s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 693/1444 [4:42:40<5:09:11, 24.70s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 694/1444 [4:43:04<5:05:55, 24.47s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 695/1444 [4:43:31<5:13:34, 25.12s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 696/1444 [4:43:57<5:18:07, 25.52s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 697/1444 [4:44:21<5:11:32, 25.02s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 698/1444 [4:44:44<5:05:10, 24.55s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 699/1444 [4:45:09<5:06:30, 24.68s/it] 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 700/1444 [4:45:34<5:06:38, 24.73s/it] {'loss': 0.6744, 'grad_norm': 0.12667891383171082, 'learning_rate': 5.498104857279941e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 956.99, 'epoch': 0.97}
48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 700/1444 [4:45:34<5:06:38, 24.73s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 701/1444 [4:46:03<5:19:58, 25.84s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 702/1444 [4:46:28<5:18:48, 25.78s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 703/1444 [4:46:52<5:11:52, 25.25s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 704/1444 [4:47:19<5:16:46, 25.68s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 705/1444 [4:47:43<5:10:45, 25.23s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 706/1444 [4:48:07<5:06:52, 24.95s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 707/1444 [4:48:33<5:10:00, 25.24s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 708/1444 [4:49:00<5:15:03, 25.68s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 709/1444 [4:49:24<5:09:44, 25.29s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 710/1444 [4:49:49<5:06:55, 25.09s/it] {'loss': 0.6298, 'grad_norm': 0.13424526154994965, 'learning_rate': 5.3864270718508305e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 953.92, 'epoch': 0.98}
49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 710/1444 [4:49:49<5:06:55, 25.09s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 711/1444 [4:50:17<5:15:53, 25.86s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 712/1444 [4:50:43<5:15:49, 25.89s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 713/1444 [4:51:10<5:19:15, 26.21s/it] 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 714/1444 [4:51:31<5:00:46, 24.72s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 715/1444 [4:51:52<4:47:34, 23.67s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 716/1444 [4:52:16<4:49:39, 23.87s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 717/1444 [4:52:37<4:38:28, 22.98s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 718/1444 [4:53:00<4:36:31, 22.85s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 719/1444 [4:53:25<4:43:14, 23.44s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 720/1444 [4:53:47<4:39:56, 23.20s/it] {'loss': 0.6199, 'grad_norm': 0.12869331240653992, 'learning_rate': 5.274554986412716e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1040.0, 'epoch': 1.0}
50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 720/1444 [4:53:47<4:39:56, 23.20s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 721/1444 [4:54:15<4:55:46, 24.55s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 722/1444 [4:54:32<4:26:35, 22.15s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 723/1444 [4:54:58<4:43:28, 23.59s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 724/1444 [4:55:25<4:54:22, 24.53s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 725/1444 [4:55:47<4:43:49, 23.69s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 726/1444 [4:56:12<4:48:18, 24.09s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 727/1444 [4:56:39<4:57:55, 24.93s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 728/1444 [4:57:04<4:57:25, 24.92s/it] 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 729/1444 [4:57:28<4:55:25, 24.79s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 730/1444 [4:57:54<4:59:01, 25.13s/it] {'loss': 0.6032, 'grad_norm': 0.14841921627521515, 'learning_rate': 5.162544851546349e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 885.8, 'epoch': 1.01}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 730/1444 [4:57:54<4:59:01, 25.13s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 731/1444 [4:58:19<4:57:57, 25.07s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 732/1444 [4:58:46<5:05:34, 25.75s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 733/1444 [4:59:13<5:09:43, 26.14s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 734/1444 [4:59:37<4:59:00, 25.27s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 735/1444 [5:00:00<4:52:57, 24.79s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 736/1444 [5:00:22<4:42:55, 23.98s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 737/1444 [5:00:44<4:32:42, 23.14s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 738/1444 [5:01:10<4:42:24, 24.00s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 739/1444 [5:01:32<4:34:35, 23.37s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 740/1444 [5:01:55<4:35:54, 23.52s/it] {'loss': 0.5982, 'grad_norm': 0.15776373445987701, 'learning_rate': 5.0504529872453256e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1010.64, 'epoch': 1.02}
51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 740/1444 [5:01:55<4:35:54, 23.52s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 741/1444 [5:02:18<4:32:41, 23.27s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 742/1444 [5:02:41<4:31:17, 23.19s/it] 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 743/1444 [5:03:04<4:31:33, 23.24s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 744/1444 [5:03:33<4:49:25, 24.81s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 745/1444 [5:03:58<4:48:24, 24.76s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 746/1444 [5:04:21<4:43:51, 24.40s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 747/1444 [5:04:44<4:39:17, 24.04s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 748/1444 [5:05:12<4:52:52, 25.25s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 749/1444 [5:05:36<4:45:46, 24.67s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 750/1444 [5:05:58<4:37:57, 24.03s/it] {'loss': 0.5993, 'grad_norm': 0.17491325736045837, 'learning_rate': 4.9383357545977497e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1085.43, 'epoch': 1.04}
52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 750/1444 [5:05:58<4:37:57, 24.03s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 751/1444 [5:06:24<4:41:43, 24.39s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 752/1444 [5:06:52<4:56:33, 25.71s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 753/1444 [5:07:18<4:54:40, 25.59s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 754/1444 [5:07:40<4:43:54, 24.69s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 755/1444 [5:08:05<4:43:35, 24.70s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 756/1444 [5:08:30<4:43:01, 24.68s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 757/1444 [5:08:55<4:44:24, 24.84s/it] 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 758/1444 [5:09:17<4:34:46, 24.03s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 759/1444 [5:09:43<4:41:01, 24.61s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 760/1444 [5:10:04<4:30:20, 23.71s/it] {'loss': 0.5512, 'grad_norm': 0.15103822946548462, 'learning_rate': 4.8262495274472225e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1024.19, 'epoch': 1.05}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 760/1444 [5:10:04<4:30:20, 23.71s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 761/1444 [5:10:29<4:33:12, 24.00s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 762/1444 [5:10:52<4:28:58, 23.66s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 763/1444 [5:11:18<4:36:49, 24.39s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 764/1444 [5:11:43<4:38:37, 24.58s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 765/1444 [5:12:08<4:37:52, 24.55s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 766/1444 [5:12:34<4:44:07, 25.14s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 767/1444 [5:12:57<4:36:19, 24.49s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 768/1444 [5:13:19<4:27:46, 23.77s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 769/1444 [5:13:47<4:39:18, 24.83s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 770/1444 [5:14:07<4:25:16, 23.61s/it] {'loss': 0.5822, 'grad_norm': 0.15340133011341095, 'learning_rate': 4.7142506640474274e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1189.62, 'epoch': 1.07}
53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 770/1444 [5:14:07<4:25:16, 23.61s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 771/1444 [5:14:31<4:25:55, 23.71s/it] 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 772/1444 [5:14:56<4:27:35, 23.89s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 773/1444 [5:15:20<4:30:23, 24.18s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 774/1444 [5:15:45<4:30:55, 24.26s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 775/1444 [5:16:08<4:25:51, 23.84s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 776/1444 [5:16:29<4:17:39, 23.14s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 777/1444 [5:16:53<4:19:36, 23.35s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 778/1444 [5:17:20<4:30:41, 24.39s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 779/1444 [5:17:42<4:23:37, 23.79s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 780/1444 [5:18:05<4:20:22, 23.53s/it] {'loss': 0.5395, 'grad_norm': 0.17283514142036438, 'learning_rate': 4.602395478724539e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1009.01, 'epoch': 1.08}
54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 780/1444 [5:18:05<4:20:22, 23.53s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 781/1444 [5:18:29<4:22:15, 23.73s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 782/1444 [5:18:51<4:14:16, 23.05s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 783/1444 [5:19:15<4:17:41, 23.39s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 784/1444 [5:19:39<4:19:21, 23.58s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 785/1444 [5:20:02<4:17:09, 23.41s/it] 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 786/1444 [5:20:23<4:08:28, 22.66s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 787/1444 [5:20:49<4:20:38, 23.80s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 788/1444 [5:21:16<4:27:52, 24.50s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 789/1444 [5:21:41<4:30:34, 24.78s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 790/1444 [5:22:03<4:22:03, 24.04s/it] {'loss': 0.5358, 'grad_norm': 0.15003962814807892, 'learning_rate': 4.490740213561727e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1047.2, 'epoch': 1.09}
55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 790/1444 [5:22:03<4:22:03, 24.04s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 791/1444 [5:22:28<4:22:47, 24.15s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 792/1444 [5:22:50<4:16:17, 23.58s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 793/1444 [5:23:12<4:11:22, 23.17s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 794/1444 [5:23:38<4:20:50, 24.08s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 795/1444 [5:24:03<4:23:03, 24.32s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 796/1444 [5:24:29<4:27:03, 24.73s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 797/1444 [5:24:56<4:32:54, 25.31s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 798/1444 [5:25:21<4:31:19, 25.20s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 799/1444 [5:25:46<4:32:55, 25.39s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 800/1444 [5:26:11<4:30:11, 25.17s/it] {'loss': 0.601, 'grad_norm': 0.1629399210214615, 'learning_rate': 4.379341010119992e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 942.61, 'epoch': 1.11}
55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 800/1444 [5:26:11<4:30:11, 25.17s/it] 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 801/1444 [5:26:35<4:27:17, 24.94s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 802/1444 [5:27:02<4:30:36, 25.29s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 803/1444 [5:27:29<4:37:50, 26.01s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 804/1444 [5:27:51<4:25:19, 24.87s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 805/1444 [5:28:18<4:29:53, 25.34s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 806/1444 [5:28:40<4:19:56, 24.45s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 807/1444 [5:29:08<4:28:33, 25.30s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 808/1444 [5:29:34<4:31:15, 25.59s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 809/1444 [5:30:01<4:35:49, 26.06s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 810/1444 [5:30:25<4:27:25, 25.31s/it] {'loss': 0.5845, 'grad_norm': 0.17462220788002014, 'learning_rate': 4.268253881209532e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1010.69, 'epoch': 1.12}
56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 810/1444 [5:30:25<4:27:25, 25.31s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 811/1444 [5:30:48<4:20:58, 24.74s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 812/1444 [5:31:14<4:25:55, 25.25s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 813/1444 [5:31:39<4:22:03, 24.92s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 814/1444 [5:32:02<4:15:50, 24.37s/it] 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 815/1444 [5:32:27<4:20:06, 24.81s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 816/1444 [5:32:49<4:10:46, 23.96s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 817/1444 [5:33:11<4:02:10, 23.17s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 818/1444 [5:33:34<4:02:36, 23.25s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 819/1444 [5:33:57<4:01:11, 23.15s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 820/1444 [5:34:23<4:10:05, 24.05s/it] {'loss': 0.5637, 'grad_norm': 0.17755432426929474, 'learning_rate': 4.157534682725856e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 911.61, 'epoch': 1.14}
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 820/1444 [5:34:23<4:10:05, 24.05s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 821/1444 [5:34:44<3:58:02, 22.93s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 822/1444 [5:35:08<4:01:17, 23.28s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 823/1444 [5:35:32<4:03:24, 23.52s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 824/1444 [5:35:58<4:12:43, 24.46s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 825/1444 [5:36:23<4:13:10, 24.54s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 826/1444 [5:36:46<4:07:12, 24.00s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 827/1444 [5:37:09<4:04:15, 23.75s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 828/1444 [5:37:34<4:06:59, 24.06s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 829/1444 [5:38:01<4:17:16, 25.10s/it] 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 830/1444 [5:38:26<4:16:56, 25.11s/it] {'loss': 0.5921, 'grad_norm': 0.1729191541671753, 'learning_rate': 4.047239085564794e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 914.41, 'epoch': 1.15}
57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 830/1444 [5:38:26<4:16:56, 25.11s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 831/1444 [5:38:49<4:07:25, 24.22s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 832/1444 [5:39:12<4:03:12, 23.84s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 833/1444 [5:39:33<3:54:58, 23.07s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 834/1444 [5:39:57<3:57:16, 23.34s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 835/1444 [5:40:22<4:02:00, 23.84s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 836/1444 [5:40:45<3:59:34, 23.64s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 837/1444 [5:41:08<3:55:49, 23.31s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 838/1444 [5:41:32<4:00:02, 23.77s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 839/1444 [5:41:59<4:08:26, 24.64s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 840/1444 [5:42:24<4:08:38, 24.70s/it] {'loss': 0.6086, 'grad_norm': 0.15745393931865692, 'learning_rate': 3.937422547630519e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1000.17, 'epoch': 1.16}
58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 840/1444 [5:42:24<4:08:38, 24.70s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 841/1444 [5:42:50<4:11:06, 24.99s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 842/1444 [5:43:13<4:06:14, 24.54s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 843/1444 [5:43:41<4:14:57, 25.45s/it] 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 844/1444 [5:44:05<4:11:28, 25.15s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 845/1444 [5:44:27<4:02:19, 24.27s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 846/1444 [5:44:52<4:02:54, 24.37s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 847/1444 [5:45:18<4:06:47, 24.80s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 848/1444 [5:45:43<4:07:33, 24.92s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 849/1444 [5:46:06<4:01:14, 24.33s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 850/1444 [5:46:30<3:59:51, 24.23s/it] {'loss': 0.5603, 'grad_norm': 0.16624517738819122, 'learning_rate': 3.828140285950676e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 991.45, 'epoch': 1.18}
59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 850/1444 [5:46:30<3:59:51, 24.23s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 851/1444 [5:46:55<4:03:34, 24.65s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 852/1444 [5:47:20<4:02:34, 24.59s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 853/1444 [5:47:44<3:59:44, 24.34s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 854/1444 [5:48:12<4:11:19, 25.56s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 855/1444 [5:48:35<4:04:36, 24.92s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 856/1444 [5:49:02<4:08:19, 25.34s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 857/1444 [5:49:27<4:07:18, 25.28s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 858/1444 [5:49:52<4:06:23, 25.23s/it] 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 859/1444 [5:50:14<3:55:13, 24.13s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 860/1444 [5:50:37<3:53:58, 24.04s/it] {'loss': 0.5715, 'grad_norm': 0.17486163973808289, 'learning_rate': 3.7194472489126176e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1005.74, 'epoch': 1.19}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 860/1444 [5:50:37<3:53:58, 24.04s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 861/1444 [5:50:58<3:42:57, 22.95s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 862/1444 [5:51:21<3:42:21, 22.92s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 863/1444 [5:51:47<3:50:51, 23.84s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 864/1444 [5:52:11<3:51:31, 23.95s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 865/1444 [5:52:38<4:01:08, 24.99s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 866/1444 [5:52:59<3:49:29, 23.82s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 867/1444 [5:53:23<3:49:15, 23.84s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 868/1444 [5:53:45<3:43:00, 23.23s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 869/1444 [5:54:07<3:38:21, 22.78s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 870/1444 [5:54:31<3:40:47, 23.08s/it] {'loss': 0.5577, 'grad_norm': 0.18270528316497803, 'learning_rate': 3.611398088634721e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 965.77, 'epoch': 1.21}
60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 870/1444 [5:54:31<3:40:47, 23.08s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 871/1444 [5:54:58<3:52:17, 24.32s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 872/1444 [5:55:20<3:45:59, 23.70s/it] 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 873/1444 [5:55:44<3:45:59, 23.75s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 874/1444 [5:56:10<3:51:52, 24.41s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 875/1444 [5:56:36<3:56:05, 24.90s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 876/1444 [5:56:57<3:44:41, 23.73s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 877/1444 [5:57:22<3:47:00, 24.02s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 878/1444 [5:57:49<3:55:40, 24.98s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 879/1444 [5:58:11<3:47:51, 24.20s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 880/1444 [5:58:38<3:53:58, 24.89s/it] {'loss': 0.5706, 'grad_norm': 0.1544029712677002, 'learning_rate': 3.5040471334866695e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 885.82, 'epoch': 1.22}
61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 880/1444 [5:58:38<3:53:58, 24.89s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 881/1444 [5:59:00<3:46:08, 24.10s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 882/1444 [5:59:22<3:39:55, 23.48s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 883/1444 [5:59:48<3:46:15, 24.20s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 884/1444 [6:00:11<3:43:28, 23.94s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 885/1444 [6:00:37<3:48:05, 24.48s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 886/1444 [6:01:00<3:44:42, 24.16s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 887/1444 [6:01:25<3:45:19, 24.27s/it] 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 888/1444 [6:01:50<3:45:48, 24.37s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 889/1444 [6:02:17<3:54:45, 25.38s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 890/1444 [6:02:44<3:58:15, 25.80s/it] {'loss': 0.5791, 'grad_norm': 0.1610870063304901, 'learning_rate': 3.397448360772516e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 903.16, 'epoch': 1.23}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 890/1444 [6:02:44<3:58:15, 25.80s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 891/1444 [6:03:06<3:46:17, 24.55s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 892/1444 [6:03:27<3:37:23, 23.63s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 893/1444 [6:03:54<3:45:09, 24.52s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 894/1444 [6:04:18<3:43:04, 24.34s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 895/1444 [6:04:41<3:40:21, 24.08s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 896/1444 [6:05:05<3:38:35, 23.93s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 897/1444 [6:05:29<3:38:53, 24.01s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 898/1444 [6:05:54<3:41:38, 24.36s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 899/1444 [6:06:17<3:37:08, 23.91s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 900/1444 [6:06:41<3:38:01, 24.05s/it] {'loss': 0.5978, 'grad_norm': 0.15956935286521912, 'learning_rate': 3.291655369590269e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 954.24, 'epoch': 1.25}
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 900/1444 [6:06:41<3:38:01, 24.05s/it][2025-12-27 14:38:48,227] [INFO] [axolotl.core.trainers.base._save:671] [PID:8935] Saving model checkpoint to ./outputs/qwen32b-thai/checkpoint-900
62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 901/1444 [6:07:10<3:50:17, 25.45s/it] 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 902/1444 [6:07:38<3:57:49, 26.33s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 903/1444 [6:08:06<4:00:25, 26.66s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 904/1444 [6:08:29<3:50:35, 25.62s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 905/1444 [6:08:52<3:43:56, 24.93s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 906/1444 [6:09:17<3:43:31, 24.93s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 907/1444 [6:09:43<3:45:01, 25.14s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 908/1444 [6:10:09<3:46:09, 25.32s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 909/1444 [6:10:31<3:38:07, 24.46s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 910/1444 [6:10:54<3:33:49, 24.03s/it] {'loss': 0.5702, 'grad_norm': 0.17166025936603546, 'learning_rate': 3.186721353881648e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1044.16, 'epoch': 1.26}
63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 910/1444 [6:10:54<3:33:49, 24.03s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 911/1444 [6:11:18<3:31:42, 23.83s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 912/1444 [6:11:46<3:42:46, 25.12s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 913/1444 [6:12:10<3:39:40, 24.82s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 914/1444 [6:12:35<3:39:55, 24.90s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 915/1444 [6:13:04<3:51:49, 26.29s/it] 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 916/1444 [6:13:25<3:37:02, 24.66s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 917/1444 [6:13:49<3:34:58, 24.48s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 918/1444 [6:14:13<3:32:23, 24.23s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 919/1444 [6:14:36<3:28:27, 23.82s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 920/1444 [6:15:02<3:34:23, 24.55s/it] {'loss': 0.5839, 'grad_norm': 0.1581714153289795, 'learning_rate': 3.082699075685553e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 924.88, 'epoch': 1.27}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 920/1444 [6:15:02<3:34:23, 24.55s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 921/1444 [6:15:26<3:32:05, 24.33s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 922/1444 [6:15:50<3:30:53, 24.24s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 923/1444 [6:16:12<3:25:12, 23.63s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 924/1444 [6:16:37<3:29:03, 24.12s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 925/1444 [6:17:03<3:32:15, 24.54s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 926/1444 [6:17:28<3:34:18, 24.82s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 927/1444 [6:17:55<3:39:03, 25.42s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 928/1444 [6:18:18<3:31:29, 24.59s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 929/1444 [6:18:46<3:41:10, 25.77s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 930/1444 [6:19:12<3:41:09, 25.82s/it] {'loss': 0.5687, 'grad_norm': 0.17262092232704163, 'learning_rate': 2.9796408386086962e-05, 'memory/max_active (GiB)': 84.97, 'memory/max_allocated (GiB)': 84.97, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 896.89, 'epoch': 1.29}
64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 930/1444 [6:19:12<3:41:09, 25.82s/it] 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 931/1444 [6:19:40<3:45:38, 26.39s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 932/1444 [6:20:04<3:37:56, 25.54s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 933/1444 [6:20:28<3:34:02, 25.13s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 934/1444 [6:20:52<3:30:32, 24.77s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 935/1444 [6:21:19<3:37:21, 25.62s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 936/1444 [6:21:43<3:31:26, 24.97s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 937/1444 [6:22:05<3:25:04, 24.27s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 938/1444 [6:22:30<3:25:48, 24.40s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 939/1444 [6:22:57<3:31:35, 25.14s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 940/1444 [6:23:25<3:38:21, 26.00s/it] {'loss': 0.5764, 'grad_norm': 0.2113182544708252, 'learning_rate': 2.8775984615267504e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 871.53, 'epoch': 1.3}
65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 940/1444 [6:23:25<3:38:21, 26.00s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 941/1444 [6:23:48<3:31:04, 25.18s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 942/1444 [6:24:13<3:30:35, 25.17s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 943/1444 [6:24:40<3:34:22, 25.67s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 944/1444 [6:25:05<3:30:57, 25.32s/it] 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 945/1444 [6:25:30<3:29:50, 25.23s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 946/1444 [6:25:56<3:31:17, 25.46s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 947/1444 [6:26:17<3:21:02, 24.27s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 948/1444 [6:26:40<3:16:55, 23.82s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 949/1444 [6:27:07<3:23:43, 24.69s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 950/1444 [6:27:36<3:33:45, 25.96s/it] {'loss': 0.5451, 'grad_norm': 0.16499237716197968, 'learning_rate': 2.7766232525292103e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 846.63, 'epoch': 1.32}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 950/1444 [6:27:36<3:33:45, 25.96s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 951/1444 [6:28:00<3:28:43, 25.40s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 952/1444 [6:28:25<3:28:22, 25.41s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 953/1444 [6:28:52<3:30:57, 25.78s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 954/1444 [6:29:19<3:33:51, 26.19s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 955/1444 [6:29:45<3:32:09, 26.03s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 956/1444 [6:30:10<3:29:33, 25.77s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 957/1444 [6:30:37<3:32:26, 26.17s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 958/1444 [6:31:04<3:33:09, 26.32s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 959/1444 [6:31:25<3:20:44, 24.83s/it] 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 960/1444 [6:31:49<3:19:17, 24.70s/it] {'loss': 0.5698, 'grad_norm': 0.16787481307983398, 'learning_rate': 2.676765983121089e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 996.59, 'epoch': 1.33}
66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 960/1444 [6:31:49<3:19:17, 24.70s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 961/1444 [6:32:09<3:05:26, 23.04s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 962/1444 [6:32:31<3:04:02, 22.91s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 963/1444 [6:32:52<2:59:50, 22.43s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 964/1444 [6:33:16<3:01:12, 22.65s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 965/1444 [6:33:41<3:07:12, 23.45s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 966/1444 [6:34:05<3:09:14, 23.75s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 967/1444 [6:34:31<3:13:41, 24.36s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 968/1444 [6:35:01<3:26:46, 26.06s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 969/1444 [6:35:25<3:21:35, 25.46s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 970/1444 [6:35:49<3:17:21, 24.98s/it] {'loss': 0.5677, 'grad_norm': 0.17758306860923767, 'learning_rate': 2.578076862694426e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 990.05, 'epoch': 1.34}
67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 970/1444 [6:35:49<3:17:21, 24.98s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 971/1444 [6:36:12<3:12:50, 24.46s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 972/1444 [6:36:39<3:17:47, 25.14s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 973/1444 [6:37:06<3:20:26, 25.53s/it] 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 974/1444 [6:37:28<3:13:43, 24.73s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 975/1444 [6:37:55<3:17:33, 25.27s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 976/1444 [6:38:21<3:19:12, 25.54s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 977/1444 [6:38:48<3:22:51, 26.06s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 978/1444 [6:39:10<3:13:05, 24.86s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 979/1444 [6:39:36<3:13:39, 24.99s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 980/1444 [6:40:00<3:11:22, 24.75s/it] {'loss': 0.5832, 'grad_norm': 0.1822880357503891, 'learning_rate': 2.4806055132824185e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 941.98, 'epoch': 1.36}
68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 980/1444 [6:40:00<3:11:22, 24.75s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 981/1444 [6:40:21<3:03:33, 23.79s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 982/1444 [6:40:42<2:55:32, 22.80s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 983/1444 [6:41:09<3:04:31, 24.02s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 984/1444 [6:41:32<3:03:17, 23.91s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 985/1444 [6:41:57<3:03:14, 23.95s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 986/1444 [6:42:22<3:05:26, 24.29s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 987/1444 [6:42:45<3:02:04, 23.91s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 988/1444 [6:43:05<2:54:37, 22.98s/it] 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 989/1444 [6:43:28<2:52:33, 22.75s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 990/1444 [6:43:53<2:57:14, 23.42s/it] {'loss': 0.5803, 'grad_norm': 0.19017541408538818, 'learning_rate': 2.384400944608886e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 942.86, 'epoch': 1.37}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 990/1444 [6:43:53<2:57:14, 23.42s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 991/1444 [6:44:15<2:55:32, 23.25s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 992/1444 [6:44:39<2:56:10, 23.39s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 993/1444 [6:45:02<2:54:36, 23.23s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 994/1444 [6:45:29<3:02:45, 24.37s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 995/1444 [6:45:53<3:00:27, 24.11s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 996/1444 [6:46:15<2:57:16, 23.74s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 997/1444 [6:46:38<2:54:09, 23.38s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 998/1444 [6:47:04<2:58:53, 24.07s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 999/1444 [6:47:29<3:01:37, 24.49s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1000/1444 [6:47:56<3:05:28, 25.06s/it] {'loss': 0.5507, 'grad_norm': 0.19104644656181335, 'learning_rate': 2.289511529445616e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 905.35, 'epoch': 1.39}
69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1000/1444 [6:47:56<3:05:28, 25.06s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1001/1444 [6:48:20<3:04:05, 24.93s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1002/1444 [6:48:44<3:00:19, 24.48s/it] 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1003/1444 [6:49:09<3:01:31, 24.70s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1004/1444 [6:49:36<3:06:14, 25.40s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1005/1444 [6:50:05<3:13:08, 26.40s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1006/1444 [6:50:28<3:06:27, 25.54s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1007/1444 [6:50:52<3:03:08, 25.15s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1008/1444 [6:51:15<2:57:56, 24.49s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1009/1444 [6:51:38<2:52:45, 23.83s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1010/1444 [6:52:03<2:55:38, 24.28s/it] {'loss': 0.5855, 'grad_norm': 0.16881509125232697, 'learning_rate': 2.195984979289974e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 963.29, 'epoch': 1.4}
70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1010/1444 [6:52:03<2:55:38, 24.28s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1011/1444 [6:52:27<2:55:19, 24.30s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1012/1444 [6:52:52<2:54:54, 24.29s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1013/1444 [6:53:13<2:49:28, 23.59s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1014/1444 [6:53:41<2:56:46, 24.67s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1015/1444 [6:54:02<2:49:44, 23.74s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1016/1444 [6:54:26<2:49:06, 23.71s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1017/1444 [6:54:47<2:43:30, 22.97s/it] 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1018/1444 [6:55:14<2:51:50, 24.20s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1019/1444 [6:55:39<2:52:57, 24.42s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1020/1444 [6:56:06<2:57:14, 25.08s/it] {'loss': 0.593, 'grad_norm': 0.17394502460956573, 'learning_rate': 2.1038683203750092e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 874.44, 'epoch': 1.41}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1020/1444 [6:56:06<2:57:14, 25.08s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1021/1444 [6:56:32<2:59:37, 25.48s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1022/1444 [6:56:55<2:54:20, 24.79s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1023/1444 [6:57:20<2:53:04, 24.67s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1024/1444 [6:57:43<2:49:05, 24.15s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1025/1444 [6:58:06<2:47:36, 24.00s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1026/1444 [6:58:29<2:45:24, 23.74s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1027/1444 [6:58:49<2:36:50, 22.57s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1028/1444 [6:59:15<2:42:31, 23.44s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1029/1444 [6:59:40<2:46:46, 24.11s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1030/1444 [7:00:06<2:50:09, 24.66s/it] {'loss': 0.5932, 'grad_norm': 0.17141559720039368, 'learning_rate': 2.0132078700241158e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 898.09, 'epoch': 1.43}
71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1030/1444 [7:00:06<2:50:09, 24.66s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1031/1444 [7:00:33<2:53:13, 25.17s/it] 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1032/1444 [7:01:01<2:59:59, 26.21s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1033/1444 [7:01:24<2:52:26, 25.17s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1034/1444 [7:01:48<2:49:13, 24.76s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1035/1444 [7:02:12<2:47:27, 24.57s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1036/1444 [7:02:33<2:39:20, 23.43s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1037/1444 [7:03:02<2:51:37, 25.30s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1038/1444 [7:03:28<2:50:54, 25.26s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1039/1444 [7:03:49<2:43:11, 24.18s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1040/1444 [7:04:20<2:56:10, 26.17s/it] {'loss': 0.5972, 'grad_norm': 0.17774531245231628, 'learning_rate': 1.924049213362153e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 780.07, 'epoch': 1.44}
72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1040/1444 [7:04:20<2:56:10, 26.17s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1041/1444 [7:04:45<2:53:11, 25.79s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1042/1444 [7:05:09<2:48:15, 25.11s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1043/1444 [7:05:31<2:42:36, 24.33s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1044/1444 [7:05:55<2:41:37, 24.24s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1045/1444 [7:06:21<2:44:36, 24.75s/it] 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1046/1444 [7:06:45<2:41:46, 24.39s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1047/1444 [7:07:10<2:43:30, 24.71s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1048/1444 [7:07:32<2:38:23, 24.00s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1049/1444 [7:08:00<2:44:16, 24.95s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1050/1444 [7:08:20<2:34:32, 23.53s/it] {'loss': 0.5828, 'grad_norm': 0.18897521495819092, 'learning_rate': 1.836437180394715e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1176.61, 'epoch': 1.45}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1050/1444 [7:08:20<2:34:32, 23.53s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1051/1444 [7:08:43<2:34:28, 23.58s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1052/1444 [7:09:12<2:43:35, 25.04s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1053/1444 [7:09:37<2:42:25, 24.92s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1054/1444 [7:09:59<2:37:55, 24.30s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1055/1444 [7:10:22<2:34:20, 23.81s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1056/1444 [7:10:49<2:39:40, 24.69s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1057/1444 [7:11:15<2:41:40, 25.07s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1058/1444 [7:11:38<2:36:56, 24.39s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1059/1444 [7:12:03<2:38:47, 24.75s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1060/1444 [7:12:25<2:33:03, 23.91s/it] {'loss': 0.5548, 'grad_norm': 0.1707300990819931, 'learning_rate': 1.750415823467082e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1087.95, 'epoch': 1.47}
73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1060/1444 [7:12:25<2:33:03, 23.91s/it] 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1061/1444 [7:12:50<2:34:57, 24.28s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1062/1444 [7:13:13<2:32:02, 23.88s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1063/1444 [7:13:36<2:30:25, 23.69s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1064/1444 [7:13:59<2:28:33, 23.46s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1065/1444 [7:14:22<2:27:19, 23.32s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1066/1444 [7:14:43<2:21:19, 22.43s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1067/1444 [7:15:07<2:24:39, 23.02s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1068/1444 [7:15:35<2:32:34, 24.35s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1069/1444 [7:15:57<2:28:46, 23.80s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1070/1444 [7:16:25<2:36:39, 25.13s/it] {'loss': 0.5622, 'grad_norm': 0.1707785278558731, 'learning_rate': 1.666028395114185e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 840.77, 'epoch': 1.48}
74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1070/1444 [7:16:25<2:36:39, 25.13s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1071/1444 [7:16:48<2:31:13, 24.33s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1072/1444 [7:17:12<2:30:03, 24.20s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1073/1444 [7:17:37<2:30:58, 24.42s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1074/1444 [7:17:59<2:27:39, 23.95s/it] 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1075/1444 [7:18:22<2:25:04, 23.59s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1076/1444 [7:18:49<2:30:06, 24.47s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1077/1444 [7:19:17<2:36:35, 25.60s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1078/1444 [7:19:41<2:33:34, 25.18s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1079/1444 [7:20:08<2:35:20, 25.53s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1080/1444 [7:20:31<2:31:14, 24.93s/it] {'loss': 0.5373, 'grad_norm': 0.1891181617975235, 'learning_rate': 1.5833173263127426e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1017.51, 'epoch': 1.5}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1080/1444 [7:20:31<2:31:14, 24.93s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1081/1444 [7:20:57<2:31:49, 25.09s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1082/1444 [7:21:18<2:25:32, 24.12s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1083/1444 [7:21:45<2:29:32, 24.85s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1084/1444 [7:22:08<2:26:45, 24.46s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1085/1444 [7:22:34<2:28:48, 24.87s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1086/1444 [7:22:56<2:22:55, 23.96s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1087/1444 [7:23:20<2:22:24, 23.94s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1088/1444 [7:23:40<2:14:25, 22.66s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1089/1444 [7:24:04<2:17:05, 23.17s/it] 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1090/1444 [7:24:28<2:17:50, 23.36s/it] {'loss': 0.5578, 'grad_norm': 0.17579694092273712, 'learning_rate': 1.5023242051464675e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 976.53, 'epoch': 1.51}
75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1090/1444 [7:24:28<2:17:50, 23.36s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1091/1444 [7:24:52<2:18:43, 23.58s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1092/1444 [7:25:14<2:16:28, 23.26s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1093/1444 [7:25:39<2:18:20, 23.65s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1094/1444 [7:26:03<2:19:04, 23.84s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1095/1444 [7:26:29<2:22:33, 24.51s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1096/1444 [7:26:51<2:17:17, 23.67s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1097/1444 [7:27:12<2:12:30, 22.91s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1098/1444 [7:27:36<2:13:02, 23.07s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1099/1444 [7:27:59<2:13:53, 23.29s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1100/1444 [7:28:23<2:14:41, 23.49s/it] {'loss': 0.5754, 'grad_norm': 0.16882087290287018, 'learning_rate': 1.4230897558950951e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1004.78, 'epoch': 1.52}
76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1100/1444 [7:28:23<2:14:41, 23.49s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1101/1444 [7:28:50<2:20:17, 24.54s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1102/1444 [7:29:15<2:20:25, 24.64s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1103/1444 [7:29:40<2:20:35, 24.74s/it] 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1104/1444 [7:30:06<2:22:24, 25.13s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1105/1444 [7:30:35<2:27:31, 26.11s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1106/1444 [7:31:00<2:25:50, 25.89s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1107/1444 [7:31:25<2:23:35, 25.57s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1108/1444 [7:31:47<2:17:28, 24.55s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1109/1444 [7:32:14<2:21:40, 25.37s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1110/1444 [7:32:34<2:12:16, 23.76s/it] {'loss': 0.5399, 'grad_norm': 0.1948268711566925, 'learning_rate': 1.3456538185577466e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1117.6, 'epoch': 1.54}
77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1110/1444 [7:32:34<2:12:16, 23.76s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1111/1444 [7:33:01<2:16:10, 24.54s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1112/1444 [7:33:25<2:14:37, 24.33s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1113/1444 [7:33:48<2:12:10, 23.96s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1114/1444 [7:34:12<2:12:00, 24.00s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1115/1444 [7:34:32<2:05:53, 22.96s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1116/1444 [7:34:59<2:11:45, 24.10s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1117/1444 [7:35:22<2:10:10, 23.89s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1118/1444 [7:35:45<2:08:17, 23.61s/it] 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1119/1444 [7:36:08<2:06:23, 23.33s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1120/1444 [7:36:35<2:11:53, 24.42s/it] {'loss': 0.5915, 'grad_norm': 0.19728456437587738, 'learning_rate': 1.2700553288209126e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 882.19, 'epoch': 1.55}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1120/1444 [7:36:35<2:11:53, 24.42s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1121/1444 [7:37:02<2:16:00, 25.26s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1122/1444 [7:37:24<2:09:25, 24.12s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1123/1444 [7:37:47<2:07:53, 23.91s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1124/1444 [7:38:09<2:03:29, 23.15s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1125/1444 [7:38:32<2:03:20, 23.20s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1126/1444 [7:38:55<2:02:52, 23.18s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1127/1444 [7:39:19<2:03:36, 23.40s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1128/1444 [7:39:48<2:13:00, 25.26s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1129/1444 [7:40:11<2:08:19, 24.44s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1130/1444 [7:40:32<2:02:52, 23.48s/it] {'loss': 0.5671, 'grad_norm': 0.17635080218315125, 'learning_rate': 1.1963322984811453e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1119.95, 'epoch': 1.57}
78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1130/1444 [7:40:32<2:02:52, 23.48s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1131/1444 [7:40:57<2:04:23, 23.84s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1132/1444 [7:41:22<2:05:31, 24.14s/it] 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1133/1444 [7:41:50<2:10:47, 25.23s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1134/1444 [7:42:14<2:09:05, 24.98s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1135/1444 [7:42:38<2:07:12, 24.70s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1136/1444 [7:43:04<2:09:19, 25.19s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1137/1444 [7:43:30<2:09:54, 25.39s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1138/1444 [7:43:55<2:09:08, 25.32s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1139/1444 [7:44:20<2:07:17, 25.04s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1140/1444 [7:44:50<2:14:32, 26.55s/it] {'loss': 0.5755, 'grad_norm': 0.17266033589839935, 'learning_rate': 1.1245217963322763e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 788.89, 'epoch': 1.58}
79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1140/1444 [7:44:50<2:14:32, 26.55s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1141/1444 [7:45:15<2:11:38, 26.07s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1142/1444 [7:45:39<2:07:50, 25.40s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1143/1444 [7:46:01<2:02:31, 24.42s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1144/1444 [7:46:28<2:06:12, 25.24s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1145/1444 [7:46:52<2:03:34, 24.80s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1146/1444 [7:47:15<2:00:17, 24.22s/it] 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1147/1444 [7:47:41<2:03:19, 24.92s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1148/1444 [7:48:05<2:00:47, 24.48s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1149/1444 [7:48:34<2:08:16, 26.09s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1150/1444 [7:48:59<2:05:40, 25.65s/it] {'loss': 0.5328, 'grad_norm': 0.17995904386043549, 'learning_rate': 1.0546599295268061e-05, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 948.5, 'epoch': 1.59}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1150/1444 [7:48:59<2:05:40, 25.65s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1151/1444 [7:49:21<2:00:13, 24.62s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1152/1444 [7:49:47<2:01:12, 24.90s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1153/1444 [7:50:12<2:01:09, 24.98s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1154/1444 [7:50:39<2:03:29, 25.55s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1155/1444 [7:51:04<2:03:01, 25.54s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1156/1444 [7:51:31<2:03:52, 25.81s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1157/1444 [7:51:57<2:03:21, 25.79s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1158/1444 [7:52:22<2:02:38, 25.73s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1159/1444 [7:52:46<1:58:57, 25.04s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1160/1444 [7:53:10<1:57:58, 24.92s/it] {'loss': 0.5633, 'grad_norm': 0.17355145514011383, 'learning_rate': 9.867818254208123e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 965.8, 'epoch': 1.61}
80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1160/1444 [7:53:10<1:57:58, 24.92s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1161/1444 [7:53:32<1:52:30, 23.85s/it] 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1162/1444 [7:53:56<1:53:31, 24.15s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1163/1444 [7:54:21<1:53:56, 24.33s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1164/1444 [7:54:45<1:53:24, 24.30s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1165/1444 [7:55:13<1:57:07, 25.19s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1166/1444 [7:55:35<1:52:43, 24.33s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1167/1444 [7:56:03<1:57:00, 25.35s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1168/1444 [7:56:29<1:57:57, 25.64s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1169/1444 [7:56:55<1:57:46, 25.70s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1170/1444 [7:57:21<1:58:00, 25.84s/it] {'loss': 0.5425, 'grad_norm': 0.18094998598098755, 'learning_rate': 9.209216139114935e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 914.6, 'epoch': 1.62}
81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1170/1444 [7:57:21<1:58:00, 25.84s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1171/1444 [7:57:45<1:54:59, 25.27s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1172/1444 [7:58:12<1:56:45, 25.76s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1173/1444 [7:58:35<1:52:16, 24.86s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1174/1444 [7:58:59<1:51:30, 24.78s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1175/1444 [7:59:25<1:52:56, 25.19s/it] 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1176/1444 [7:59:48<1:48:42, 24.34s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1177/1444 [8:00:14<1:51:31, 25.06s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1178/1444 [8:00:41<1:52:54, 25.47s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1179/1444 [8:01:09<1:55:36, 26.17s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1180/1444 [8:01:34<1:53:45, 25.85s/it] {'loss': 0.577, 'grad_norm': 0.19211266934871674, 'learning_rate': 8.571124102762768e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 987.37, 'epoch': 1.63}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1180/1444 [8:01:34<1:53:45, 25.85s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1181/1444 [8:01:57<1:49:36, 25.00s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1182/1444 [8:02:24<1:52:25, 25.75s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1183/1444 [8:02:50<1:52:00, 25.75s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1184/1444 [8:03:12<1:46:10, 24.50s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1185/1444 [8:03:34<1:42:48, 23.82s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1186/1444 [8:03:58<1:42:45, 23.90s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1187/1444 [8:04:25<1:46:54, 24.96s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1188/1444 [8:04:52<1:48:41, 25.48s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1189/1444 [8:05:15<1:45:09, 24.74s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1190/1444 [8:05:42<1:47:52, 25.48s/it] {'loss': 0.5719, 'grad_norm': 0.1781914234161377, 'learning_rate': 7.95386298522065e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 864.39, 'epoch': 1.65}
82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1190/1444 [8:05:42<1:47:52, 25.48s/it] 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1191/1444 [8:06:02<1:40:35, 23.86s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1192/1444 [8:06:29<1:43:54, 24.74s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1193/1444 [8:06:54<1:43:57, 24.85s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1194/1444 [8:07:19<1:43:16, 24.79s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1195/1444 [8:07:44<1:43:19, 24.90s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1196/1444 [8:08:10<1:44:02, 25.17s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1197/1444 [8:08:37<1:46:11, 25.80s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1198/1444 [8:09:03<1:45:56, 25.84s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1199/1444 [8:09:29<1:45:13, 25.77s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1200/1444 [8:09:54<1:43:53, 25.55s/it] {'loss': 0.5647, 'grad_norm': 0.17774876952171326, 'learning_rate': 7.357743152530272e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 954.56, 'epoch': 1.66}
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1200/1444 [8:09:54<1:43:53, 25.55s/it][2025-12-27 16:42:00,609] [INFO] [axolotl.core.trainers.base._save:671] [PID:8935] Saving model checkpoint to ./outputs/qwen32b-thai/checkpoint-1200
83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1201/1444 [8:10:27<1:52:18, 27.73s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1202/1444 [8:10:49<1:45:38, 26.19s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1203/1444 [8:11:12<1:40:45, 25.09s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1204/1444 [8:11:36<1:39:18, 24.83s/it] 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1205/1444 [8:11:58<1:36:12, 24.15s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1206/1444 [8:12:23<1:35:49, 24.16s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1207/1444 [8:12:46<1:34:49, 24.01s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1208/1444 [8:13:10<1:33:45, 23.84s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1209/1444 [8:13:32<1:32:04, 23.51s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1210/1444 [8:13:54<1:29:21, 22.91s/it] {'loss': 0.6007, 'grad_norm': 0.1787899136543274, 'learning_rate': 6.783064340650364e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1119.17, 'epoch': 1.68}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1210/1444 [8:13:54<1:29:21, 22.91s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1211/1444 [8:14:17<1:29:30, 23.05s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1212/1444 [8:14:42<1:30:28, 23.40s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1213/1444 [8:15:07<1:31:59, 23.89s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1214/1444 [8:15:33<1:34:40, 24.70s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1215/1444 [8:15:56<1:32:00, 24.10s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1216/1444 [8:16:18<1:29:07, 23.46s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1217/1444 [8:16:43<1:30:24, 23.90s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1218/1444 [8:17:08<1:31:12, 24.22s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1219/1444 [8:17:31<1:29:11, 23.79s/it] 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1220/1444 [8:17:57<1:32:06, 24.67s/it] {'loss': 0.5647, 'grad_norm': 0.16189730167388916, 'learning_rate': 6.230115504745953e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 889.49, 'epoch': 1.69}
84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1220/1444 [8:17:57<1:32:06, 24.67s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1221/1444 [8:18:22<1:31:33, 24.63s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1222/1444 [8:18:44<1:28:39, 23.96s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1223/1444 [8:19:08<1:28:01, 23.90s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1224/1444 [8:19:32<1:28:06, 24.03s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1225/1444 [8:19:58<1:29:17, 24.46s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1226/1444 [8:20:23<1:30:06, 24.80s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1227/1444 [8:20:49<1:30:29, 25.02s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1228/1444 [8:21:14<1:30:14, 25.07s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1229/1444 [8:21:39<1:29:33, 24.99s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1230/1444 [8:22:04<1:29:11, 25.01s/it] {'loss': 0.5239, 'grad_norm': 0.17614033818244934, 'learning_rate': 5.699174673898394e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 925.55, 'epoch': 1.7}
85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1230/1444 [8:22:04<1:29:11, 25.01s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1231/1444 [8:22:29<1:28:21, 24.89s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1232/1444 [8:22:54<1:28:00, 24.91s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1233/1444 [8:23:20<1:28:45, 25.24s/it] 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1234/1444 [8:23:48<1:31:58, 26.28s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1235/1444 [8:24:13<1:30:24, 25.95s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1236/1444 [8:24:41<1:31:19, 26.34s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1237/1444 [8:25:07<1:31:09, 26.42s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1238/1444 [8:25:31<1:27:51, 25.59s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1239/1444 [8:25:53<1:23:51, 24.54s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1240/1444 [8:26:16<1:22:17, 24.20s/it] {'loss': 0.5803, 'grad_norm': 0.18107430636882782, 'learning_rate': 5.190508811309091e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1020.96, 'epoch': 1.72}
86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1240/1444 [8:26:16<1:22:17, 24.20s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1241/1444 [8:26:40<1:21:17, 24.03s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1242/1444 [8:27:07<1:24:19, 25.05s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1243/1444 [8:27:32<1:22:54, 24.75s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1244/1444 [8:27:56<1:22:24, 24.72s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1245/1444 [8:28:21<1:22:18, 24.82s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1246/1444 [8:28:45<1:21:07, 24.58s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1247/1444 [8:29:10<1:21:15, 24.75s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1248/1444 [8:29:33<1:18:54, 24.16s/it] 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1249/1444 [8:29:59<1:19:45, 24.54s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1250/1444 [8:30:23<1:19:22, 24.55s/it] {'loss': 0.5542, 'grad_norm': 0.18331079185009003, 'learning_rate': 4.704373680067325e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 975.2, 'epoch': 1.73}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1250/1444 [8:30:23<1:19:22, 24.55s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1251/1444 [8:30:50<1:21:26, 25.32s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1252/1444 [8:31:18<1:22:56, 25.92s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1253/1444 [8:31:43<1:22:24, 25.89s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1254/1444 [8:32:11<1:23:35, 26.40s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1255/1444 [8:32:37<1:22:20, 26.14s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1256/1444 [8:33:02<1:20:58, 25.84s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1257/1444 [8:33:26<1:19:33, 25.52s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1258/1444 [8:33:47<1:14:52, 24.16s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1259/1444 [8:34:11<1:14:08, 24.05s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1260/1444 [8:34:36<1:14:24, 24.27s/it] {'loss': 0.5879, 'grad_norm': 0.20262940227985382, 'learning_rate': 4.241013714549597e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 935.67, 'epoch': 1.75}
87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1260/1444 [8:34:36<1:14:24, 24.27s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1261/1444 [8:35:03<1:16:34, 25.11s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1262/1444 [8:35:28<1:16:13, 25.13s/it] 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1263/1444 [8:35:55<1:16:51, 25.48s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1264/1444 [8:36:16<1:12:51, 24.29s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1265/1444 [8:36:43<1:14:33, 24.99s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1266/1444 [8:37:08<1:14:11, 25.01s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1267/1444 [8:37:32<1:13:20, 24.86s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1268/1444 [8:37:56<1:11:51, 24.50s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1269/1444 [8:38:20<1:10:53, 24.30s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1270/1444 [8:38:45<1:10:59, 24.48s/it] {'loss': 0.5437, 'grad_norm': 0.16563181579113007, 'learning_rate': 3.800661897515245e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 947.0, 'epoch': 1.76}
88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1270/1444 [8:38:45<1:10:59, 24.48s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1271/1444 [8:39:10<1:11:29, 24.80s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1272/1444 [8:39:37<1:12:54, 25.43s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1273/1444 [8:40:01<1:11:10, 24.98s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1274/1444 [8:40:26<1:10:37, 24.93s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1275/1444 [8:40:49<1:08:23, 24.28s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1276/1444 [8:41:15<1:09:51, 24.95s/it] 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1277/1444 [8:41:41<1:10:27, 25.31s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1278/1444 [8:42:06<1:09:43, 25.20s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1279/1444 [8:42:27<1:06:04, 24.03s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1280/1444 [8:42:51<1:05:00, 23.78s/it] {'loss': 0.5445, 'grad_norm': 0.1958320140838623, 'learning_rate': 3.3835396429599152e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 981.25, 'epoch': 1.77}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1280/1444 [8:42:51<1:05:00, 23.78s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1281/1444 [8:43:13<1:03:40, 23.44s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1282/1444 [8:43:38<1:04:28, 23.88s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1283/1444 [8:44:00<1:02:21, 23.24s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1284/1444 [8:44:24<1:02:44, 23.53s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1285/1444 [8:44:48<1:02:40, 23.65s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1286/1444 [8:45:12<1:02:41, 23.81s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1287/1444 [8:45:39<1:04:20, 24.59s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1288/1444 [8:46:05<1:05:10, 25.06s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1289/1444 [8:46:27<1:02:31, 24.20s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1290/1444 [8:46:54<1:03:57, 24.92s/it] {'loss': 0.5358, 'grad_norm': 0.1918836534023285, 'learning_rate': 2.9898566847861245e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 909.78, 'epoch': 1.79}
89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1290/1444 [8:46:54<1:03:57, 24.92s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1291/1444 [8:47:17<1:02:31, 24.52s/it] 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1292/1444 [8:47:41<1:01:35, 24.31s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1293/1444 [8:48:02<58:56, 23.42s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1294/1444 [8:48:25<57:58, 23.19s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1295/1444 [8:48:47<56:57, 22.94s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1296/1444 [8:49:12<58:03, 23.54s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1297/1444 [8:49:38<59:05, 24.12s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1298/1444 [8:50:04<1:00:26, 24.84s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1299/1444 [8:50:27<58:37, 24.26s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1300/1444 [8:50:52<58:46, 24.49s/it] {'loss': 0.5728, 'grad_norm': 0.18790839612483978, 'learning_rate': 2.619810971346587e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 961.21, 'epoch': 1.8}
90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1300/1444 [8:50:52<58:46, 24.49s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1301/1444 [8:51:17<58:19, 24.47s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1302/1444 [8:51:45<1:00:36, 25.61s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1303/1444 [8:52:13<1:01:32, 26.19s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1304/1444 [8:52:34<57:55, 24.83s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1305/1444 [8:52:58<56:43, 24.48s/it] 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1306/1444 [8:53:21<55:40, 24.21s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1307/1444 [8:53:45<54:47, 23.99s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1308/1444 [8:54:12<56:47, 25.06s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1309/1444 [8:54:39<57:36, 25.60s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1310/1444 [8:55:05<57:10, 25.60s/it] {'loss': 0.5943, 'grad_norm': 0.18140196800231934, 'learning_rate': 2.2735885659134925e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 933.61, 'epoch': 1.81}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1310/1444 [8:55:05<57:10, 25.60s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1311/1444 [8:55:26<54:03, 24.39s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1312/1444 [8:55:54<55:29, 25.22s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1313/1444 [8:56:16<53:09, 24.35s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1314/1444 [8:56:41<53:01, 24.47s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1315/1444 [8:57:06<53:14, 24.77s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1316/1444 [8:57:30<52:20, 24.54s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1317/1444 [8:57:53<50:58, 24.08s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1318/1444 [8:58:16<50:04, 23.84s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1319/1444 [8:58:41<49:49, 23.91s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1320/1444 [8:59:06<50:10, 24.28s/it] {'loss': 0.5557, 'grad_norm': 0.19009986519813538, 'learning_rate': 1.9513635531237417e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 913.45, 'epoch': 1.83}
91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1320/1444 [8:59:06<50:10, 24.28s/it] 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1321/1444 [8:59:28<48:22, 23.59s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1322/1444 [8:59:54<49:47, 24.49s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1323/1444 [9:00:20<50:00, 24.79s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1324/1444 [9:00:49<52:00, 26.00s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1325/1444 [9:01:15<51:38, 26.04s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1326/1444 [9:01:39<50:09, 25.50s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1327/1444 [9:02:02<48:32, 24.89s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1328/1444 [9:02:25<46:58, 24.30s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1329/1444 [9:02:48<45:42, 23.85s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1330/1444 [9:03:11<44:59, 23.68s/it] {'loss': 0.5853, 'grad_norm': 0.1914367824792862, 'learning_rate': 1.6532979514471747e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1024.91, 'epoch': 1.84}
92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1330/1444 [9:03:11<44:59, 23.68s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1331/1444 [9:03:35<44:39, 23.71s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1332/1444 [9:03:58<43:45, 23.44s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1333/1444 [9:04:20<42:27, 22.95s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1334/1444 [9:04:44<42:48, 23.35s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1335/1444 [9:05:10<43:32, 23.97s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1336/1444 [9:05:32<42:21, 23.53s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1337/1444 [9:05:57<42:59, 24.10s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1338/1444 [9:06:22<42:34, 24.10s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1339/1444 [9:06:45<41:39, 23.81s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1340/1444 [9:07:08<40:48, 23.54s/it] {'loss': 0.5669, 'grad_norm': 0.1720149666070938, 'learning_rate': 1.3795416317218036e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1067.62, 'epoch': 1.86}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1340/1444 [9:07:08<40:48, 23.54s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1341/1444 [9:07:31<40:34, 23.63s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1342/1444 [9:07:57<41:01, 24.13s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1343/1444 [9:08:23<41:54, 24.89s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1344/1444 [9:08:51<42:43, 25.64s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1345/1444 [9:09:16<42:00, 25.46s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1346/1444 [9:09:41<41:16, 25.27s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1347/1444 [9:10:05<40:18, 24.93s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1348/1444 [9:10:29<39:25, 24.64s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1349/1444 [9:10:56<40:10, 25.38s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1350/1444 [9:11:20<39:14, 25.05s/it] {'loss': 0.5802, 'grad_norm': 0.17772097885608673, 'learning_rate': 1.1302322417970135e-06, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 940.57, 'epoch': 1.87}
93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1350/1444 [9:11:20<39:14, 25.05s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1351/1444 [9:11:42<37:13, 24.02s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1352/1444 [9:12:04<36:05, 23.53s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 1353/1444 [9:12:30<36:35, 24.12s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1354/1444 [9:12:50<34:36, 23.07s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1355/1444 [9:13:15<35:08, 23.69s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1356/1444 [9:13:42<36:10, 24.66s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1357/1444 [9:14:07<35:40, 24.60s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1358/1444 [9:14:30<34:48, 24.29s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1359/1444 [9:14:57<35:15, 24.89s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1360/1444 [9:15:21<34:45, 24.83s/it] {'loss': 0.5436, 'grad_norm': 0.1829637587070465, 'learning_rate': 9.054951373226484e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 931.71, 'epoch': 1.88}
94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1360/1444 [9:15:21<34:45, 24.83s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1361/1444 [9:15:47<34:33, 24.98s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1362/1444 [9:16:12<34:19, 25.12s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1363/1444 [9:16:35<33:04, 24.50s/it] 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1364/1444 [9:16:57<31:42, 23.78s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1365/1444 [9:17:23<31:55, 24.25s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1366/1444 [9:17:49<32:11, 24.77s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1367/1444 [9:18:13<31:44, 24.73s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1368/1444 [9:18:40<32:08, 25.37s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1369/1444 [9:19:08<32:29, 25.99s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1370/1444 [9:19:28<29:58, 24.31s/it] {'loss': 0.5751, 'grad_norm': 0.190989688038826, 'learning_rate': 7.054433187187071e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1210.22, 'epoch': 1.9}
95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1370/1444 [9:19:28<29:58, 24.31s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 1371/1444 [9:19:52<29:37, 24.35s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1372/1444 [9:20:16<29:03, 24.22s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1373/1444 [9:20:41<28:45, 24.30s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1374/1444 [9:21:02<27:20, 23.44s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1375/1444 [9:21:29<28:01, 24.36s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1376/1444 [9:21:55<28:19, 24.99s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1377/1444 [9:22:22<28:37, 25.63s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1378/1444 [9:22:48<28:11, 25.64s/it] 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1379/1444 [9:23:07<25:44, 23.76s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1380/1444 [9:23:31<25:11, 23.61s/it] {'loss': 0.5524, 'grad_norm': 0.17515617609024048, 'learning_rate': 5.301773743574712e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 972.68, 'epoch': 1.91}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1380/1444 [9:23:31<25:11, 23.61s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1381/1444 [9:23:54<24:49, 23.64s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1382/1444 [9:24:19<24:50, 24.04s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1383/1444 [9:24:47<25:33, 25.14s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1384/1444 [9:25:10<24:38, 24.64s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1385/1444 [9:25:36<24:26, 24.85s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1386/1444 [9:26:02<24:24, 25.26s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1387/1444 [9:26:25<23:23, 24.63s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1388/1444 [9:26:52<23:32, 25.22s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 1389/1444 [9:27:17<23:04, 25.18s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1390/1444 [9:27:44<23:04, 25.63s/it] {'loss': 0.5438, 'grad_norm': 0.17383797466754913, 'learning_rate': 3.7978542998643454e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 915.78, 'epoch': 1.93}
96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1390/1444 [9:27:44<23:04, 25.63s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1391/1444 [9:28:06<21:41, 24.55s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1392/1444 [9:28:33<22:08, 25.55s/it] 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1393/1444 [9:28:55<20:35, 24.22s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1394/1444 [9:29:22<20:54, 25.10s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1395/1444 [9:29:46<20:22, 24.95s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1396/1444 [9:30:10<19:32, 24.42s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1397/1444 [9:30:35<19:19, 24.67s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1398/1444 [9:30:59<18:47, 24.52s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1399/1444 [9:31:23<18:15, 24.33s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1400/1444 [9:31:46<17:30, 23.89s/it] {'loss': 0.5667, 'grad_norm': 0.1715794801712036, 'learning_rate': 2.5434310441773137e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 1099.45, 'epoch': 1.94}
97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1400/1444 [9:31:46<17:30, 23.89s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1401/1444 [9:32:09<17:02, 23.77s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1402/1444 [9:32:31<16:10, 23.10s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1403/1444 [9:32:55<16:01, 23.45s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1404/1444 [9:33:18<15:34, 23.37s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1405/1444 [9:33:41<15:04, 23.19s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1406/1444 [9:34:05<14:48, 23.37s/it] 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 1407/1444 [9:34:30<14:46, 23.96s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1408/1444 [9:34:56<14:38, 24.41s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1409/1444 [9:35:21<14:21, 24.61s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1410/1444 [9:35:45<13:59, 24.68s/it] {'loss': 0.542, 'grad_norm': 0.1821635514497757, 'learning_rate': 1.5391347150607304e-07, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 949.21, 'epoch': 1.95}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1410/1444 [9:35:45<13:59, 24.68s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1411/1444 [9:36:09<13:21, 24.28s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1412/1444 [9:36:31<12:39, 23.72s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1413/1444 [9:36:54<12:08, 23.50s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1414/1444 [9:37:19<11:53, 23.77s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1415/1444 [9:37:43<11:33, 23.92s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1416/1444 [9:38:09<11:25, 24.46s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1417/1444 [9:38:34<11:04, 24.60s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1418/1444 [9:38:56<10:22, 23.96s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1419/1444 [9:39:20<10:00, 24.02s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1420/1444 [9:39:49<10:11, 25.46s/it] {'loss': 0.5384, 'grad_norm': 0.19293761253356934, 'learning_rate': 7.854702843449469e-08, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 825.94, 'epoch': 1.97}
98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1420/1444 [9:39:49<10:11, 25.46s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1421/1444 [9:40:13<09:34, 24.96s/it] 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1422/1444 [9:40:38<09:09, 24.96s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1423/1444 [9:41:05<08:59, 25.69s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1424/1444 [9:41:31<08:37, 25.85s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 1425/1444 [9:41:57<08:08, 25.69s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1426/1444 [9:42:23<07:47, 25.95s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1427/1444 [9:42:45<07:02, 24.83s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1428/1444 [9:43:13<06:49, 25.61s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1429/1444 [9:43:37<06:16, 25.07s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1430/1444 [9:44:02<05:50, 25.03s/it] {'loss': 0.6172, 'grad_norm': 0.19018813967704773, 'learning_rate': 2.828167032379869e-08, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 973.31, 'epoch': 1.98}
99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1430/1444 [9:44:02<05:50, 25.03s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1431/1444 [9:44:24<05:15, 24.25s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1432/1444 [9:44:49<04:53, 24.46s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1433/1444 [9:45:15<04:34, 24.95s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1434/1444 [9:45:40<04:09, 24.94s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1435/1444 [9:46:05<03:45, 25.08s/it] 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1436/1444 [9:46:28<03:13, 24.24s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1437/1444 [9:46:54<02:55, 25.01s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1438/1444 [9:47:18<02:27, 24.56s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1439/1444 [9:47:43<02:03, 24.72s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1440/1444 [9:48:08<01:39, 24.86s/it] {'loss': 0.5543, 'grad_norm': 0.180649071931839, 'learning_rate': 3.1426711784299233e-09, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'tokens_per_second_per_gpu': 945.3, 'epoch': 1.99}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1440/1444 [9:48:08<01:39, 24.86s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1441/1444 [9:48:33<01:14, 24.94s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1442/1444 [9:48:58<00:49, 24.98s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1443/1444 [9:49:23<00:24, 24.92s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1444/1444 [9:49:34<00:00, 20.65s/it][2025-12-27 18:21:40,804] [INFO] [axolotl.core.trainers.base._save:671] [PID:8935] Saving model checkpoint to ./outputs/qwen32b-thai/checkpoint-1444
{'train_runtime': 35379.7585, 'train_samples_per_second': 1.306, 'train_steps_per_second': 0.041, 'train_loss': 0.6086704093663646, 'memory/max_active (GiB)': 85.72, 'memory/max_allocated (GiB)': 85.72, 'memory/device_reserved (GiB)': 90.59, 'epoch': 2.0}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1444/1444 [9:49:39<00:00, 20.65s/it] 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1444/1444 [9:49:39<00:00, 24.50s/it]
[2025-12-27 18:21:46,260] [INFO] [axolotl.train.save_trained_model:218] [PID:8935] Training completed! Saving trained model to ./outputs/qwen32b-thai.
[2025-12-27 18:21:47,839] [INFO] [axolotl.train.save_trained_model:336] [PID:8935] Model successfully saved to ./outputs/qwen32b-thai