config.json: 0.00B [00:00, ?B/s]config.json: 4.62kB [00:00, 22.1MB/s]
[2026-04-29 00:28:59,993] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:3345] baseline 0.000GB ()
[2026-04-29 00:28:59,994] [INFO] [axolotl.cli.config.load_cfg:341] [PID:3345] config:
{
  "activation_offloading": true,
  "adapter": "lora",
  "axolotl_config_path": "Mura.yaml",
  "base_model": "google/gemma-4-31B-it",
  "base_model_config": "google/gemma-4-31B-it",
  "batch_size": 8,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_120",
    "fp8": true,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "chat_template": "jinja",
  "chat_template_jinja": "{%- macro strip_thinking(text) -%}\n    {%- set ns = namespace(result='') -%}\n    {%- for part in text.split('<channel|>') -%}\n        {%- if '<|channel>' in part -%}\n            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n        {%- else -%}\n            {%- set ns.result = ns.result + part -%}\n        {%- endif -%}\n    {%- endfor -%}\n    {{- ns.result | trim -}}\n{%- endmacro -%}\n{%- set loop_messages = messages -%} {{ bos_token }}\n{#- Handle System Definitions Block -#} {%- if (enable_thinking is defined and enable_thinking) or messages[0]['role'] in ['system', 'developer'] -%}\n    {{- '<|turn>system\\n' -}}\n\n    {#- Inject Thinking token at the very top of the FIRST system turn -#}\n    {%- if enable_thinking is defined and enable_thinking -%}\n        {{- '<|think|>' -}}\n    {%- endif -%}\n\n    {%- if messages[0]['role'] in ['system', 'developer'] -%}\n        {{- messages[0]['content'] | trim -}}\n        {%- set loop_messages = messages[1:] -%}\n    {%- endif -%}\n    \n    {{- '<turn|>\\n' -}}\n{%- endif %}\n{#- Loop through messages -#} {%- for message in loop_messages -%}\n    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n    {{- '<|turn>' + role + '\\n' -}}\n\n    {#- Flag to identify the final SFT turn -#}\n    {%- set is_final_sft_turn = loop.last and not add_generation_prompt -%}\n\n    {%- if message['content'] is string -%}\n        {%- if role == 'model' -%}\n            {%- if is_final_sft_turn and '<|channel>thought' not in message['content'] -%}\n                {{- '<|channel>thought\\n<channel|>' -}}\n            {%- endif -%}\n            {{- strip_thinking(message['content']) -}}\n        {%- else -%}\n            {{- message['content'] | trim -}}\n        {%- endif -%}\n    {%- elif message['content'] is sequence -%}\n        {%- set ns = namespace(has_thinking=false) -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'text' and '<|channel>thought' in item['text'] -%}\n                {%- set ns.has_thinking = true -%}\n            {%- endif -%}\n        {%- endfor -%}\n        \n        {%- if role == 'model' and is_final_sft_turn and not ns.has_thinking -%}\n            {{- '<|channel>thought\\n<channel|>' -}}\n        {%- endif -%}\n\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'text' -%}\n                {%- if role == 'model' -%}\n                    {{- strip_thinking(item['text']) -}}\n                {%- else -%}\n                    {{- item['text'] | trim -}}\n                {%- endif -%}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- endif -%}\n\n    {{- '<turn|>\\n' -}}\n{%- endfor -%}\n{#- Generation Prompt handled as normal (serves as the final turn when true) -#} {%- if add_generation_prompt -%}\n    {{- '<|turn>model\\n' -}}\n    {%- if not enable_thinking | default(false) -%}\n        {{- '<|channel>thought\\n<channel|>' -}}\n    {%- endif -%}\n{%- endif -%}\n",
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "ConicCat/Mura_Books",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eot_tokens": [
    "<turn|>"
  ],
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": false,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "hf_use_auth_token": true,
  "include_tkps": true,
  "is_multimodal": true,
  "layer_offloading": false,
  "learning_rate": 2.5e-05,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rms_norm_gated": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 64,
  "lora_dropout": 0.0,
  "lora_mlp_kernel": false,
  "lora_o_kernel": false,
  "lora_qkv_kernel": false,
  "lora_r": 32,
  "lora_target_modules": "model.language_model.layers.[\\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj",
  "loraplus_lr_embedding": 1e-06,
  "loraplus_lr_ratio": 16.0,
  "lr_scheduler": "cosine",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 2,
  "model_config_type": "gemma4",
  "model_config_type_text": "gemma4_text",
  "num_epochs": 11.0,
  "num_generation_samples": 3,
  "optimizer": "paged_adamw_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./Writer-Stage-2",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "google/gemma-4-31B-it",
  "profiler_steps_start": 0,
  "push_dataset_to_hub": "ConicCat/Gemma4-Mura",
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_strategy": "no",
  "sdp_attention": true,
  "seed": 42,
  "sequence_len": 2048,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "google/gemma-4-31B-it",
  "tokenizer_save_jinja_files": true,
  "torch_compile": false,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_tensorboard": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.05,
  "weight_decay": 0.0,
  "world_size": 1
}
tokenizer_config.json: 0.00B [00:00, ?B/s]tokenizer_config.json: 2.10kB [00:00, 9.15MB/s]
tokenizer.json:   0%|                                                                                                                                                                                                                                                                                                                                                                                                      | 0.00/32.2M [00:00<?, ?B/s]tokenizer.json:   0%|                                                                                                                                                                                                                                                                                                                                                                                                      | 0.00/32.2M [00:00<?, ?B/s]tokenizer.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32.2M/32.2M [00:00<00:00, 161MB/s]tokenizer.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32.2M/32.2M [00:00<00:00, 52.9MB/s]
chat_template.jinja: 0.00B [00:00, ?B/s]chat_template.jinja: 16.9kB [00:00, 50.3MB/s]
[2026-04-29 00:29:02,981] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:3345] EOS: 1 / <eos>
[2026-04-29 00:29:02,981] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3345] BOS: 2 / <bos>
[2026-04-29 00:29:02,981] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3345] PAD: 0 / <pad>
[2026-04-29 00:29:02,982] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3345] UNK: 3 / <unk>
[2026-04-29 00:29:02,987] [INFO] [axolotl.utils.data.shared.try_load_from_hub:491] [PID:3345] Attempting to load prepared dataset from HuggingFace Hub at ConicCat/Gemma4-Mura (version cd09e8d53ec6484b42196b1f3b13c4dd)...
README.md:   0%|                                                                                                                                                                                                                                                                                                                                                                                                             | 0.00/523 [00:00<?, ?B/s]README.md: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 523/523 [00:00<00:00, 6.03MB/s]
cd09e8d53ec6484b42196b1f3b13c4dd/train-0(…):   0%|                                                                                                                                                                                                                                                                                                                                                                         | 0.00/1.24M [00:00<?, ?B/s]cd09e8d53ec6484b42196b1f3b13c4dd/train-0(…):   0%|                                                                                                                                                                                                                                                                                                                                                                         | 0.00/1.24M [00:00<?, ?B/s]cd09e8d53ec6484b42196b1f3b13c4dd/train-0(…): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.24M/1.24M [00:00<00:00, 6.23MB/s]cd09e8d53ec6484b42196b1f3b13c4dd/train-0(…): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.24M/1.24M [00:00<00:00, 3.08MB/s]
Generating train split:   0%|                                                                                                                                                                                                                                                                                                                                                                                           | 0/675 [00:00<?, ? examples/s]Generating train split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 675/675 [00:00<00:00, 18922.18 examples/s]
[2026-04-29 00:29:04,443] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:3345] total_num_tokens: 350_244
[2026-04-29 00:29:04,452] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:3345] `total_supervised_tokens: 250_020`
[2026-04-29 00:29:05,691] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.6142261028289795
[2026-04-29 00:29:06,286] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.5948703289031982
[2026-04-29 00:29:06,883] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.5964534282684326
[2026-04-29 00:29:07,499] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.6151058673858643
[2026-04-29 00:29:07,525] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:3345] gather_len_batches: [91]
[2026-04-29 00:29:07,527] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:3345] data_loader_len: 22
[2026-04-29 00:29:07,528] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:3345] sample_packing_eff_est across ranks: [0.9396570226648352]
[2026-04-29 00:29:07,528] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:3345] sample_packing_eff_est: 0.94
[2026-04-29 00:29:07,529] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:3345] total_num_steps: 242
[2026-04-29 00:29:07,529] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:3345] Maximum number of steps set at 242
[2026-04-29 00:29:07,616] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:3345] loading tokenizer... google/gemma-4-31B-it
[2026-04-29 00:29:09,310] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:3345] EOS: 1 / <eos>
[2026-04-29 00:29:09,311] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:3345] BOS: 2 / <bos>
[2026-04-29 00:29:09,311] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:3345] PAD: 0 / <pad>
[2026-04-29 00:29:09,311] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:3345] UNK: 3 / <unk>
processor_config.json: 0.00B [00:00, ?B/s]processor_config.json: 1.69kB [00:00, 9.14MB/s]
[2026-04-29 00:29:13,167] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:3345] Loading model
[2026-04-29 00:29:13,230] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:3345] Patched OptimState8bit for torch.compile compatibility
[2026-04-29 00:29:13,231] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:3345] Patched OptimState4bit for torch.compile compatibility
[2026-04-29 00:29:13,231] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:3345] Patched OptimStateFp8 for torch.compile compatibility
[2026-04-29 00:29:13,234] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:3345] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-04-29 00:29:13,235] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:3345] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-04-29 00:29:13,236] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:478] [PID:3345] Applying multipack dataloader patch for sample packing...
[2026-04-29 00:29:13,249] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:3345] Applying Cut Cross Entropy to model type: gemma4
[2026-04-29 00:29:13,382] [WARNING] [axolotl.integrations.liger.plugin.pre_model_load:241] [PID:3345] Unsupported model config type: gemma4. Liger not applied.
model.safetensors.index.json: 0.00B [00:00, ?B/s]model.safetensors.index.json: 120kB [00:00, 87.5MB/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 2 files:   0%|                                                                                                                                                                                                                                                                                                                                                                                                          | 0/2 [00:00<?, ?it/s][ADownloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                                                                                                   | 0.00/49.8G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                                                                                                   | 0.00/62.5G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                                                                                                   | 0.00/62.5G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                                                                                         | 413k/62.5G [00:00<9:40:03, 1.80MB/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                                                                                        | 544k/62.5G [00:00<14:49:26, 1.17MB/s]Downloading (incomplete total...):   0%|▍                                                                                                                                                                                                                                                                                                                                                                          | 67.5M/62.5G [00:00<07:05, 147MB/s]Downloading (incomplete total...):   1%|█▉                                                                                                                                                                                                                                                                                                                                                                          | 336M/62.5G [00:01<01:45, 587MB/s]Downloading (incomplete total...):   2%|█████▊                                                                                                                                                                                                                                                                                                                                                                    | 1.01G/62.5G [00:01<00:36, 1.68GB/s]Downloading (incomplete total...):   3%|████████████▍                                                                                                                                                                                                                                                                                                                                                             | 2.15G/62.5G [00:01<00:29, 2.05GB/s]Downloading (incomplete total...):   6%|████████████████████▉                                                                                                                                                                                                                                                                                                                                                     | 3.62G/62.5G [00:02<00:21, 2.76GB/s]Downloading (incomplete total...):   8%|███████████████████████████▍                                                                                                                                                                                                                                                                                                                                              | 4.74G/62.5G [00:02<00:17, 3.34GB/s]Downloading (incomplete total...):  10%|███████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                      | 6.09G/62.5G [00:02<00:19, 2.87GB/s]Downloading (incomplete total...):  12%|█████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                                | 7.21G/62.5G [00:03<00:15, 3.50GB/s]Downloading (incomplete total...):  13%|███████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                          | 8.28G/62.5G [00:03<00:16, 3.21GB/s]Downloading (incomplete total...):  15%|██████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                   | 9.46G/62.5G [00:03<00:14, 3.72GB/s]Downloading (incomplete total...):  17%|████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                              | 10.4G/62.5G [00:03<00:14, 3.55GB/s]Downloading (incomplete total...):  19%|███████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                       | 11.6G/62.5G [00:04<00:13, 3.70GB/s]Downloading (incomplete total...):  20%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                | 12.7G/62.5G [00:04<00:13, 3.78GB/s]Downloading (incomplete total...):  22%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                          | 13.8G/62.5G [00:04<00:13, 3.70GB/s]Downloading (incomplete total...):  24%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                   | 14.9G/62.5G [00:05<00:13, 3.60GB/s]Downloading (incomplete total...):  25%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                              | 15.9G/62.5G [00:05<00:13, 3.59GB/s]Downloading (incomplete total...):  27%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                       | 17.1G/62.5G [00:06<00:15, 2.86GB/s]Downloading (incomplete total...):  29%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                | 18.3G/62.5G [00:06<00:12, 3.67GB/s]Downloading (incomplete total...):  31%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                          | 19.3G/62.5G [00:06<00:12, 3.48GB/s]Downloading (incomplete total...):  32%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                    | 20.3G/62.5G [00:06<00:10, 4.03GB/s]Downloading (incomplete total...):  34%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                             | 21.5G/62.5G [00:07<00:13, 3.03GB/s]Downloading (incomplete total...):  36%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                       | 22.6G/62.5G [00:07<00:13, 2.88GB/s]Downloading (incomplete total...):  38%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                | 23.8G/62.5G [00:07<00:11, 3.48GB/s]Downloading (incomplete total...):  40%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                          | 24.9G/62.5G [00:08<00:15, 2.41GB/s]Downloading (incomplete total...):  44%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                            | 27.2G/62.5G [00:10<00:17, 2.01GB/s]Downloading (incomplete total...):  47%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                | 29.3G/62.5G [00:10<00:15, 2.17GB/s]Downloading (incomplete total...):  50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                   | 31.5G/62.5G [00:11<00:09, 3.15GB/s]Downloading (incomplete total...):  52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                             | 32.6G/62.5G [00:11<00:10, 2.73GB/s]Downloading (incomplete total...):  54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                       | 33.7G/62.5G [00:12<00:11, 2.45GB/s]Downloading (incomplete total...):  56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                | 34.8G/62.5G [00:12<00:12, 2.26GB/s]Downloading (incomplete total...):  57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                          | 35.8G/62.5G [00:13<00:12, 2.10GB/s]Downloading (incomplete total...):  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                   | 37.0G/62.5G [00:14<00:12, 2.07GB/s]Downloading (incomplete total...):  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                             | 38.1G/62.5G [00:14<00:13, 1.81GB/s]Downloading (incomplete total...):  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                       | 39.2G/62.5G [00:15<00:12, 1.81GB/s]Downloading (incomplete total...):  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                | 40.3G/62.5G [00:16<00:12, 1.81GB/s]Downloading (incomplete total...):  66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                         | 41.5G/62.5G [00:16<00:12, 1.71GB/s]Downloading (incomplete total...):  68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 42.7G/62.5G [00:17<00:12, 1.64GB/s]Downloading (incomplete total...):  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 43.8G/62.5G [00:18<00:12, 1.53GB/s]Downloading (incomplete total...):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 44.9G/62.5G [00:19<00:11, 1.48GB/s]Downloading (incomplete total...):  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 45.9G/62.5G [00:19<00:10, 1.54GB/s]Downloading (incomplete total...):  75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 47.1G/62.5G [00:20<00:09, 1.67GB/s]Downloading (incomplete total...):  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 48.2G/62.5G [00:21<00:08, 1.71GB/s]Downloading (incomplete total...):  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 49.3G/62.5G [00:21<00:07, 1.74GB/s]Downloading (incomplete total...):  81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 50.4G/62.5G [00:22<00:06, 1.76GB/s]Downloading (incomplete total...):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 51.5G/62.5G [00:22<00:06, 1.77GB/s]Downloading (incomplete total...):  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 52.7G/62.5G [00:23<00:05, 1.84GB/s]Downloading (incomplete total...):  86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 53.7G/62.5G [00:23<00:04, 2.02GB/s]Downloading (incomplete total...):  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 54.8G/62.5G [00:24<00:03, 1.95GB/s]Downloading (incomplete total...):  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 55.9G/62.5G [00:25<00:03, 1.91GB/s]Downloading (incomplete total...):  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 57.0G/62.5G [00:25<00:02, 1.90GB/s]Downloading (incomplete total...):  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 58.2G/62.5G [00:26<00:02, 1.93GB/s]Downloading (incomplete total...):  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 59.3G/62.5G [00:26<00:01, 2.10GB/s]Downloading (incomplete total...):  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 60.4G/62.5G [00:27<00:01, 2.00GB/s]Downloading (incomplete total...):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 61.4G/62.5G [00:27<00:00, 2.16GB/s]Downloading (incomplete total...):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 61.7G/62.5G [00:27<00:00, 2.03GB/s]Downloading (incomplete total...):  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 62.1G/62.5G [00:28<00:00, 2.05GB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.5G/62.5G [00:28<00:00, 2.04GB/s]
Fetching 2 files:  50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                 | 1/2 [00:28<00:28, 28.66s/it][AFetching 2 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:28<00:00, 14.33s/it]
Download complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.5G/62.5G [00:28<00:00, 2.04GB/s]Download complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.5G/62.5G [00:28<00:00, 2.18GB/s]
Loading weights:   0%|                                                                                                                                                                                                                                                                                                                                                                                                        | 0/1188 [00:00<?, ?it/s]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1188/1188 [00:00<00:00, 14919.56it/s]
generation_config.json:   0%|                                                                                                                                                                                                                                                                                                                                                                                                | 0.00/208 [00:00<?, ?B/s]generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 208/208 [00:00<00:00, 2.20MB/s]
[2026-04-29 00:29:44,196] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:3345] Converting modules to torch.bfloat16
[2026-04-29 00:29:45,059] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:3345] Memory usage after model load 0.000GB ()
trainable params: 244,858,880 || all params: 31,517,945,392 || trainable%: 0.7769
[2026-04-29 00:29:46,141] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:3345] after adapters 0.000GB ()
[2026-04-29 00:29:54,412] [INFO] [axolotl.train.save_initial_configs:417] [PID:3345] Pre-saving adapter config to ./Writer-Stage-2...
[2026-04-29 00:29:54,415] [INFO] [axolotl.train.save_initial_configs:421] [PID:3345] Pre-saving tokenizer to ./Writer-Stage-2...
[2026-04-29 00:29:54,723] [INFO] [axolotl.train.save_initial_configs:426] [PID:3345] Pre-saving model config to ./Writer-Stage-2...
[2026-04-29 00:29:54,728] [INFO] [axolotl.train.save_initial_configs:430] [PID:3345] Pre-saving processor to ./Writer-Stage-2...
[2026-04-29 00:29:55,018] [INFO] [axolotl.train.execute_training:222] [PID:3345] Starting trainer...
[2026-04-29 00:29:57,312] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.9119553565979004
[2026-04-29 00:29:58,228] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.9157748222351074
[2026-04-29 00:29:59,127] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.8982894420623779
[2026-04-29 00:30:00,027] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3345] generate_batches time: 0.8996067047119141
[2026-04-29 00:30:00,028] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:3345] gather_len_batches: [91]
  0%|                                                                                                                                                                                                                                                                                                                                                                                                                          | 0/242 [00:00<?, ?it/s]  0%|█▋                                                                                                                                                                                                                                                                                                                                                                                                              | 1/242 [00:32<2:09:37, 32.27s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '3.746', 'grad_norm': '1.207', 'learning_rate': '0', 'ppl': '42.35', 'memory/max_active (GiB)': '64.79', 'memory/max_allocated (GiB)': '64.79', 'memory/device_reserved (GiB)': '65.22', 'tokens/train_per_sec_per_gpu': '87.91', 'tokens/total': 16384, 'tokens/trainable': 11026, 'epoch': '0.04396'}
  0%|█▋                                                                                                                                                                                                                                                                                                                                                                                                              | 1/242 [00:32<2:09:37, 32.27s/it]  1%|███▎                                                                                                                                                                                                                                                                                                                                                                                                            | 2/242 [00:58<1:55:26, 28.86s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '3.811', 'grad_norm': '1.227', 'learning_rate': '2.083e-06', 'ppl': '45.18', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.29', 'tokens/train_per_sec_per_gpu': '100.1', 'tokens/total': 32768, 'tokens/trainable': 22113, 'epoch': '0.08791'}
  1%|███▎                                                                                                                                                                                                                                                                                                                                                                                                            | 2/242 [00:58<1:55:26, 28.86s/it]  1%|████▉                                                                                                                                                                                                                                                                                                                                                                                                           | 3/242 [01:25<1:50:41, 27.79s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '3.83', 'grad_norm': '1.353', 'learning_rate': '4.167e-06', 'ppl': '46.06', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '107.1', 'tokens/total': 49152, 'tokens/trainable': 33035, 'epoch': '0.1319'}
  1%|████▉                                                                                                                                                                                                                                                                                                                                                                                                           | 3/242 [01:25<1:50:41, 27.79s/it]  2%|██████▌                                                                                                                                                                                                                                                                                                                                                                                                         | 4/242 [01:51<1:48:17, 27.30s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '3.263', 'grad_norm': '1.114', 'learning_rate': '6.25e-06', 'ppl': '26.13', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '110.4', 'tokens/total': 65536, 'tokens/trainable': 44266, 'epoch': '0.1758'}
  2%|██████▌                                                                                                                                                                                                                                                                                                                                                                                                         | 4/242 [01:51<1:48:17, 27.30s/it]  2%|████████▎                                                                                                                                                                                                                                                                                                                                                                                                       | 5/242 [02:20<1:49:10, 27.64s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.885', 'grad_norm': '0.9089', 'learning_rate': '8.333e-06', 'ppl': '17.9', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '94.68', 'tokens/total': 81920, 'tokens/trainable': 55667, 'epoch': '0.2198'}
  2%|████████▎                                                                                                                                                                                                                                                                                                                                                                                                       | 5/242 [02:20<1:49:10, 27.64s/it]  2%|█████████▉                                                                                                                                                                                                                                                                                                                                                                                                      | 6/242 [02:46<1:47:15, 27.27s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.494', 'grad_norm': '0.4528', 'learning_rate': '1.042e-05', 'ppl': '12.11', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 98304, 'tokens/trainable': 67143, 'epoch': '0.2637'}
  2%|█████████▉                                                                                                                                                                                                                                                                                                                                                                                                      | 6/242 [02:46<1:47:15, 27.27s/it]  3%|███████████▌                                                                                                                                                                                                                                                                                                                                                                                                    | 7/242 [03:13<1:45:50, 27.03s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.345', 'grad_norm': '0.8443', 'learning_rate': '1.25e-05', 'ppl': '10.43', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '101.8', 'tokens/total': 114688, 'tokens/trainable': 78085, 'epoch': '0.3077'}
  3%|███████████▌                                                                                                                                                                                                                                                                                                                                                                                                    | 7/242 [03:13<1:45:50, 27.03s/it]  3%|█████████████▏                                                                                                                                                                                                                                                                                                                                                                                                  | 8/242 [03:39<1:44:46, 26.86s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.476', 'grad_norm': '1.456', 'learning_rate': '1.458e-05', 'ppl': '11.9', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '103.4', 'tokens/total': 131072, 'tokens/trainable': 88718, 'epoch': '0.3516'}
  3%|█████████████▏                                                                                                                                                                                                                                                                                                                                                                                                  | 8/242 [03:39<1:44:46, 26.86s/it]  4%|██████████████▉                                                                                                                                                                                                                                                                                                                                                                                                 | 9/242 [04:06<1:43:55, 26.76s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.529', 'grad_norm': '1.562', 'learning_rate': '1.667e-05', 'ppl': '12.54', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '105.7', 'tokens/total': 147456, 'tokens/trainable': 99662, 'epoch': '0.3956'}
  4%|██████████████▉                                                                                                                                                                                                                                                                                                                                                                                                 | 9/242 [04:06<1:43:55, 26.76s/it]  4%|████████████████▍                                                                                                                                                                                                                                                                                                                                                                                              | 10/242 [04:32<1:43:10, 26.68s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.467', 'grad_norm': '1.588', 'learning_rate': '1.875e-05', 'ppl': '11.78', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '105.7', 'tokens/total': 163840, 'tokens/trainable': 110330, 'epoch': '0.4396'}
  4%|████████████████▍                                                                                                                                                                                                                                                                                                                                                                                              | 10/242 [04:32<1:43:10, 26.68s/it]  5%|██████████████████▏                                                                                                                                                                                                                                                                                                                                                                                            | 11/242 [04:59<1:42:31, 26.63s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.404', 'grad_norm': '0.5222', 'learning_rate': '2.083e-05', 'ppl': '11.07', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '109', 'tokens/total': 180224, 'tokens/trainable': 121321, 'epoch': '0.4835'}
  5%|██████████████████▏                                                                                                                                                                                                                                                                                                                                                                                            | 11/242 [04:59<1:42:31, 26.63s/it]  5%|███████████████████▊                                                                                                                                                                                                                                                                                                                                                                                           | 12/242 [05:25<1:41:59, 26.61s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.317', 'grad_norm': '0.2528', 'learning_rate': '2.292e-05', 'ppl': '10.14', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.31', 'tokens/train_per_sec_per_gpu': '107.5', 'tokens/total': 196608, 'tokens/trainable': 132751, 'epoch': '0.5275'}
  5%|███████████████████▊                                                                                                                                                                                                                                                                                                                                                                                           | 12/242 [05:25<1:41:59, 26.61s/it]  5%|█████████████████████▍                                                                                                                                                                                                                                                                                                                                                                                         | 13/242 [05:52<1:41:25, 26.57s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.228', 'grad_norm': '0.5686', 'learning_rate': '2.5e-05', 'ppl': '9.284', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '94.91', 'tokens/total': 212992, 'tokens/trainable': 143654, 'epoch': '0.5714'}
  5%|█████████████████████▍                                                                                                                                                                                                                                                                                                                                                                                         | 13/242 [05:52<1:41:25, 26.57s/it]  6%|███████████████████████                                                                                                                                                                                                                                                                                                                                                                                        | 14/242 [06:18<1:40:55, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.237', 'grad_norm': '0.6978', 'learning_rate': '2.5e-05', 'ppl': '9.366', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '102.8', 'tokens/total': 229376, 'tokens/trainable': 154627, 'epoch': '0.6154'}
  6%|███████████████████████                                                                                                                                                                                                                                                                                                                                                                                        | 14/242 [06:18<1:40:55, 26.56s/it]  6%|████████████████████████▋                                                                                                                                                                                                                                                                                                                                                                                      | 15/242 [06:45<1:40:28, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.197', 'grad_norm': '0.5642', 'learning_rate': '2.5e-05', 'ppl': '8.994', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 245760, 'tokens/trainable': 165886, 'epoch': '0.6593'}
  6%|████████████████████████▋                                                                                                                                                                                                                                                                                                                                                                                      | 15/242 [06:45<1:40:28, 26.56s/it]  7%|██████████████████████████▍                                                                                                                                                                                                                                                                                                                                                                                    | 16/242 [07:11<1:40:01, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.176', 'grad_norm': '0.3654', 'learning_rate': '2.499e-05', 'ppl': '8.813', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '102.2', 'tokens/total': 262144, 'tokens/trainable': 176911, 'epoch': '0.7033'}
  7%|██████████████████████████▍                                                                                                                                                                                                                                                                                                                                                                                    | 16/242 [07:11<1:40:01, 26.55s/it]  7%|████████████████████████████                                                                                                                                                                                                                                                                                                                                                                                   | 17/242 [07:38<1:39:33, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.143', 'grad_norm': '0.1293', 'learning_rate': '2.498e-05', 'ppl': '8.527', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '101.3', 'tokens/total': 278528, 'tokens/trainable': 187935, 'epoch': '0.7473'}
  7%|████████████████████████████                                                                                                                                                                                                                                                                                                                                                                                   | 17/242 [07:38<1:39:33, 26.55s/it]  7%|█████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                                                 | 18/242 [08:04<1:39:04, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.137', 'grad_norm': '0.253', 'learning_rate': '2.497e-05', 'ppl': '8.472', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '105.1', 'tokens/total': 294912, 'tokens/trainable': 198910, 'epoch': '0.7912'}
  7%|█████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                                                 | 18/242 [08:04<1:39:04, 26.54s/it]  8%|███████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                                               | 19/242 [08:31<1:38:35, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.138', 'grad_norm': '0.4266', 'learning_rate': '2.496e-05', 'ppl': '8.479', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '105.7', 'tokens/total': 311296, 'tokens/trainable': 209640, 'epoch': '0.8352'}
  8%|███████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                                               | 19/242 [08:31<1:38:35, 26.53s/it]  8%|████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                                                              | 20/242 [08:57<1:38:09, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.094', 'grad_norm': '0.3504', 'learning_rate': '2.494e-05', 'ppl': '8.119', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '105.7', 'tokens/total': 327680, 'tokens/trainable': 220919, 'epoch': '0.8791'}
  8%|████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                                                              | 20/242 [08:57<1:38:09, 26.53s/it]  9%|██████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                            | 21/242 [09:24<1:37:41, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.038', 'grad_norm': '0.2918', 'learning_rate': '2.493e-05', 'ppl': '7.678', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 344064, 'tokens/trainable': 231622, 'epoch': '0.9231'}
  9%|██████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                            | 21/242 [09:24<1:37:41, 26.52s/it]  9%|████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                                          | 22/242 [09:50<1:37:11, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.101', 'grad_norm': '0.09805', 'learning_rate': '2.491e-05', 'ppl': '8.175', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '99.45', 'tokens/total': 360448, 'tokens/trainable': 241749, 'epoch': '0.967'}
  9%|████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                                          | 22/242 [09:50<1:37:11, 26.51s/it] 10%|█████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                                                         | 23/242 [10:10<1:29:34, 24.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.096', 'grad_norm': '0.1887', 'learning_rate': '2.488e-05', 'ppl': '8.131', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '127.7', 'tokens/total': 372736, 'tokens/trainable': 250020, 'epoch': '1'}
 10%|█████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                                                         | 23/242 [10:10<1:29:34, 24.54s/it] 10%|███████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                       | 24/242 [10:39<1:33:15, 25.67s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.937', 'grad_norm': '0.1604', 'learning_rate': '2.486e-05', 'ppl': '6.94', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '103.6', 'tokens/total': 389120, 'tokens/trainable': 260955, 'epoch': '1.044'}
 10%|███████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                       | 24/242 [10:39<1:33:15, 25.67s/it] 10%|█████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                                                     | 25/242 [11:05<1:33:44, 25.92s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.931', 'grad_norm': '0.2596', 'learning_rate': '2.483e-05', 'ppl': '6.899', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '100', 'tokens/total': 405504, 'tokens/trainable': 272111, 'epoch': '1.088'}
 10%|█████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                                                     | 25/242 [11:05<1:33:44, 25.92s/it] 11%|██████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                                    | 26/242 [11:32<1:33:57, 26.10s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.951', 'grad_norm': '0.2632', 'learning_rate': '2.48e-05', 'ppl': '7.036', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 421888, 'tokens/trainable': 283298, 'epoch': '1.132'}
 11%|██████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                                    | 26/242 [11:32<1:33:57, 26.10s/it] 11%|████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                  | 27/242 [11:58<1:33:57, 26.22s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.008', 'grad_norm': '0.5446', 'learning_rate': '2.477e-05', 'ppl': '7.445', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '104.9', 'tokens/total': 438272, 'tokens/trainable': 294390, 'epoch': '1.176'}
 11%|████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                                                  | 27/242 [11:58<1:33:57, 26.22s/it] 12%|██████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                                                | 28/242 [12:25<1:33:49, 26.31s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.938', 'grad_norm': '0.08229', 'learning_rate': '2.474e-05', 'ppl': '6.946', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '106', 'tokens/total': 454656, 'tokens/trainable': 305729, 'epoch': '1.22'}
 12%|██████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                                                | 28/242 [12:25<1:33:49, 26.31s/it] 12%|███████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                               | 29/242 [12:51<1:33:35, 26.36s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.983', 'grad_norm': '0.1499', 'learning_rate': '2.47e-05', 'ppl': '7.264', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '101', 'tokens/total': 471040, 'tokens/trainable': 316176, 'epoch': '1.264'}
 12%|███████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                               | 29/242 [12:51<1:33:35, 26.36s/it] 12%|█████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                                             | 30/242 [13:18<1:33:16, 26.40s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.053', 'grad_norm': '0.1291', 'learning_rate': '2.466e-05', 'ppl': '7.788', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '105.4', 'tokens/total': 487424, 'tokens/trainable': 326822, 'epoch': '1.308'}
 12%|█████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                                             | 30/242 [13:18<1:33:16, 26.40s/it] 13%|███████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                            | 31/242 [13:44<1:32:58, 26.44s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.989', 'grad_norm': '0.1126', 'learning_rate': '2.462e-05', 'ppl': '7.311', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '104.5', 'tokens/total': 503808, 'tokens/trainable': 337949, 'epoch': '1.352'}
 13%|███████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                            | 31/242 [13:44<1:32:58, 26.44s/it] 13%|████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                          | 32/242 [14:11<1:32:37, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.875', 'grad_norm': '0.1692', 'learning_rate': '2.458e-05', 'ppl': '6.52', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '98.8', 'tokens/total': 520192, 'tokens/trainable': 349121, 'epoch': '1.396'}
 13%|████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                                          | 32/242 [14:11<1:32:37, 26.46s/it] 14%|██████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                                        | 33/242 [14:37<1:32:15, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.971', 'grad_norm': '0.3518', 'learning_rate': '2.454e-05', 'ppl': '7.178', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 536576, 'tokens/trainable': 360427, 'epoch': '1.44'}
 14%|██████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                                        | 33/242 [14:37<1:32:15, 26.49s/it] 14%|████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                       | 34/242 [15:04<1:31:48, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.984', 'grad_norm': '0.1198', 'learning_rate': '2.449e-05', 'ppl': '7.268', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '100.8', 'tokens/total': 552960, 'tokens/trainable': 370757, 'epoch': '1.484'}
 14%|████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                       | 34/242 [15:04<1:31:48, 26.48s/it] 14%|█████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                     | 35/242 [15:30<1:31:23, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.927', 'grad_norm': '0.08251', 'learning_rate': '2.444e-05', 'ppl': '6.869', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '96.48', 'tokens/total': 569344, 'tokens/trainable': 381696, 'epoch': '1.527'}
 14%|█████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                     | 35/242 [15:30<1:31:23, 26.49s/it] 15%|███████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                   | 36/242 [15:57<1:30:58, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.921', 'grad_norm': '0.112', 'learning_rate': '2.439e-05', 'ppl': '6.828', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 585728, 'tokens/trainable': 392665, 'epoch': '1.571'}
 15%|███████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                   | 36/242 [15:57<1:30:58, 26.50s/it] 15%|█████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                  | 37/242 [16:23<1:30:33, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.97', 'grad_norm': '0.1157', 'learning_rate': '2.433e-05', 'ppl': '7.169', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '109', 'tokens/total': 602112, 'tokens/trainable': 403743, 'epoch': '1.615'}
 15%|█████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                                                  | 37/242 [16:23<1:30:33, 26.51s/it] 16%|██████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                | 38/242 [16:50<1:30:06, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '2.006', 'grad_norm': '0.1598', 'learning_rate': '2.428e-05', 'ppl': '7.434', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '104.9', 'tokens/total': 618496, 'tokens/trainable': 414605, 'epoch': '1.659'}
 16%|██████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                                                | 38/242 [16:50<1:30:06, 26.50s/it] 16%|████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                              | 39/242 [17:16<1:29:38, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.905', 'grad_norm': '0.1229', 'learning_rate': '2.422e-05', 'ppl': '6.718', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 634880, 'tokens/trainable': 425265, 'epoch': '1.703'}
 16%|████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                              | 39/242 [17:16<1:29:38, 26.49s/it] 17%|█████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                             | 40/242 [17:43<1:29:14, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.992', 'grad_norm': '0.162', 'learning_rate': '2.416e-05', 'ppl': '7.33', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.33', 'tokens/train_per_sec_per_gpu': '105', 'tokens/total': 651264, 'tokens/trainable': 436441, 'epoch': '1.747'}
 17%|█████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                             | 40/242 [17:43<1:29:14, 26.51s/it] 17%|███████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                           | 41/242 [18:09<1:28:53, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.879', 'grad_norm': '0.08294', 'learning_rate': '2.41e-05', 'ppl': '6.549', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.1', 'tokens/total': 667648, 'tokens/trainable': 447348, 'epoch': '1.791'}
 17%|███████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                           | 41/242 [18:09<1:28:53, 26.53s/it] 17%|█████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                         | 42/242 [18:36<1:28:28, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.897', 'grad_norm': '0.2545', 'learning_rate': '2.403e-05', 'ppl': '6.664', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '94.68', 'tokens/total': 684032, 'tokens/trainable': 458058, 'epoch': '1.835'}
 17%|█████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                         | 42/242 [18:36<1:28:28, 26.54s/it] 18%|██████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                        | 43/242 [19:03<1:28:05, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.901', 'grad_norm': '0.1428', 'learning_rate': '2.397e-05', 'ppl': '6.691', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.6', 'tokens/total': 700416, 'tokens/trainable': 469055, 'epoch': '1.879'}
 18%|██████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                                        | 43/242 [19:03<1:28:05, 26.56s/it] 18%|████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                      | 44/242 [19:29<1:27:40, 26.57s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.989', 'grad_norm': '0.1645', 'learning_rate': '2.39e-05', 'ppl': '7.311', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105.8', 'tokens/total': 716800, 'tokens/trainable': 480254, 'epoch': '1.923'}
 18%|████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                                      | 44/242 [19:29<1:27:40, 26.57s/it] 19%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                    | 45/242 [19:56<1:27:11, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.932', 'grad_norm': '0.1671', 'learning_rate': '2.382e-05', 'ppl': '6.905', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '94.64', 'tokens/total': 733184, 'tokens/trainable': 490960, 'epoch': '1.967'}
 19%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                                    | 45/242 [19:56<1:27:11, 26.56s/it] 19%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                   | 46/242 [20:16<1:20:17, 24.58s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.894', 'grad_norm': '0.1513', 'learning_rate': '2.375e-05', 'ppl': '6.649', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '115.1', 'tokens/total': 745472, 'tokens/trainable': 498416, 'epoch': '2'}
 19%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                                   | 46/242 [20:16<1:20:17, 24.58s/it] 19%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                 | 47/242 [20:44<1:23:13, 25.61s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.82', 'grad_norm': '0.1264', 'learning_rate': '2.368e-05', 'ppl': '6.171', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 761856, 'tokens/trainable': 509691, 'epoch': '2.044'}
 19%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                                 | 47/242 [20:44<1:23:13, 25.61s/it] 20%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                               | 48/242 [21:10<1:23:44, 25.90s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.744', 'grad_norm': '0.1457', 'learning_rate': '2.36e-05', 'ppl': '5.721', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.3', 'tokens/total': 778240, 'tokens/trainable': 521071, 'epoch': '2.088'}
 20%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                                               | 48/242 [21:10<1:23:44, 25.90s/it] 20%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                              | 49/242 [21:37<1:23:55, 26.09s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.723', 'grad_norm': '0.1251', 'learning_rate': '2.352e-05', 'ppl': '5.6', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105', 'tokens/total': 794624, 'tokens/trainable': 532280, 'epoch': '2.132'}
 20%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                                              | 49/242 [21:37<1:23:55, 26.09s/it] 21%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                            | 50/242 [22:03<1:23:54, 26.22s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.764', 'grad_norm': '0.1614', 'learning_rate': '2.344e-05', 'ppl': '5.834', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.7', 'tokens/total': 811008, 'tokens/trainable': 543523, 'epoch': '2.176'}
 21%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                            | 50/242 [22:03<1:23:54, 26.22s/it] 21%|████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                           | 51/242 [22:30<1:23:46, 26.31s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.745', 'grad_norm': '0.1363', 'learning_rate': '2.335e-05', 'ppl': '5.728', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 827392, 'tokens/trainable': 554899, 'epoch': '2.22'}
 21%|████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                           | 51/242 [22:30<1:23:46, 26.31s/it] 21%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                         | 52/242 [22:56<1:23:33, 26.39s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.707', 'grad_norm': '0.1613', 'learning_rate': '2.327e-05', 'ppl': '5.514', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.4', 'tokens/total': 843776, 'tokens/trainable': 566029, 'epoch': '2.264'}
 21%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                         | 52/242 [22:56<1:23:33, 26.39s/it] 22%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                       | 53/242 [23:23<1:23:19, 26.45s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.769', 'grad_norm': '0.1414', 'learning_rate': '2.318e-05', 'ppl': '5.866', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.1', 'tokens/total': 860160, 'tokens/trainable': 577309, 'epoch': '2.308'}
 22%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                                       | 53/242 [23:23<1:23:19, 26.45s/it] 22%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                      | 54/242 [23:50<1:22:58, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.724', 'grad_norm': '0.1071', 'learning_rate': '2.309e-05', 'ppl': '5.609', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.5', 'tokens/total': 876544, 'tokens/trainable': 588468, 'epoch': '2.352'}
 22%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                                                      | 54/242 [23:50<1:22:58, 26.48s/it] 23%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                    | 55/242 [24:16<1:22:38, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.736', 'grad_norm': '0.09143', 'learning_rate': '2.3e-05', 'ppl': '5.673', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '98.34', 'tokens/total': 892928, 'tokens/trainable': 599214, 'epoch': '2.396'}
 23%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                                    | 55/242 [24:16<1:22:38, 26.51s/it] 23%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                  | 56/242 [24:43<1:22:14, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.727', 'grad_norm': '0.1362', 'learning_rate': '2.291e-05', 'ppl': '5.621', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.4', 'tokens/total': 909312, 'tokens/trainable': 610243, 'epoch': '2.44'}
 23%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                                  | 56/242 [24:43<1:22:14, 26.53s/it] 24%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                 | 57/242 [25:09<1:21:50, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.669', 'grad_norm': '0.2245', 'learning_rate': '2.281e-05', 'ppl': '5.306', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.86', 'tokens/total': 925696, 'tokens/trainable': 620995, 'epoch': '2.484'}
 24%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                                 | 57/242 [25:09<1:21:50, 26.54s/it] 24%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                               | 58/242 [25:36<1:21:24, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.748', 'grad_norm': '0.1339', 'learning_rate': '2.271e-05', 'ppl': '5.742', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 942080, 'tokens/trainable': 631730, 'epoch': '2.527'}
 24%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                                               | 58/242 [25:36<1:21:24, 26.55s/it] 24%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                             | 59/242 [26:02<1:20:59, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.761', 'grad_norm': '0.1065', 'learning_rate': '2.261e-05', 'ppl': '5.818', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '99.11', 'tokens/total': 958464, 'tokens/trainable': 642698, 'epoch': '2.571'}
 24%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                                             | 59/242 [26:02<1:20:59, 26.55s/it] 25%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                            | 60/242 [26:29<1:20:32, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.686', 'grad_norm': '0.09912', 'learning_rate': '2.251e-05', 'ppl': '5.396', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.2', 'tokens/total': 974848, 'tokens/trainable': 653417, 'epoch': '2.615'}
 25%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                                            | 60/242 [26:29<1:20:32, 26.55s/it] 25%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                          | 61/242 [26:55<1:20:04, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.748', 'grad_norm': '0.1101', 'learning_rate': '2.241e-05', 'ppl': '5.743', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.48', 'tokens/total': 991232, 'tokens/trainable': 663602, 'epoch': '2.659'}
 25%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                          | 61/242 [26:56<1:20:04, 26.55s/it] 26%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                        | 62/242 [27:22<1:19:39, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.684', 'grad_norm': '0.1881', 'learning_rate': '2.23e-05', 'ppl': '5.388', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '109.6', 'tokens/total': 1007616, 'tokens/trainable': 674970, 'epoch': '2.703'}
 26%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                        | 62/242 [27:22<1:19:39, 26.55s/it] 26%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                       | 63/242 [27:49<1:19:13, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.746', 'grad_norm': '0.1046', 'learning_rate': '2.22e-05', 'ppl': '5.729', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.6', 'tokens/total': 1024000, 'tokens/trainable': 685934, 'epoch': '2.747'}
 26%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                       | 63/242 [27:49<1:19:13, 26.56s/it] 26%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                     | 64/242 [28:15<1:18:48, 26.57s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.677', 'grad_norm': '0.3198', 'learning_rate': '2.209e-05', 'ppl': '5.347', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.02', 'tokens/total': 1040384, 'tokens/trainable': 696773, 'epoch': '2.791'}
 26%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                     | 64/242 [28:15<1:18:48, 26.57s/it] 27%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                   | 65/242 [28:42<1:18:23, 26.57s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.596', 'grad_norm': '6.939', 'learning_rate': '2.198e-05', 'ppl': '4.932', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '92.57', 'tokens/total': 1056768, 'tokens/trainable': 707375, 'epoch': '2.835'}
 27%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                                   | 65/242 [28:42<1:18:23, 26.57s/it] 27%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                  | 66/242 [29:08<1:17:55, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.72', 'grad_norm': '0.09935', 'learning_rate': '2.187e-05', 'ppl': '5.586', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 1073152, 'tokens/trainable': 717931, 'epoch': '2.879'}
 27%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                                  | 66/242 [29:08<1:17:55, 26.56s/it] 28%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                | 67/242 [29:35<1:17:26, 26.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.746', 'grad_norm': '0.0881', 'learning_rate': '2.175e-05', 'ppl': '5.73', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '92.86', 'tokens/total': 1089536, 'tokens/trainable': 728879, 'epoch': '2.923'}
 28%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                                | 67/242 [29:35<1:17:26, 26.55s/it] 28%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                               | 68/242 [30:01<1:16:57, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.747', 'grad_norm': '0.1023', 'learning_rate': '2.164e-05', 'ppl': '5.738', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.49', 'tokens/total': 1105920, 'tokens/trainable': 739617, 'epoch': '2.967'}
 28%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                               | 68/242 [30:01<1:16:57, 26.54s/it] 29%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                             | 69/242 [30:21<1:10:50, 24.57s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.651', 'grad_norm': '0.09832', 'learning_rate': '2.152e-05', 'ppl': '5.212', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '140.1', 'tokens/total': 1118208, 'tokens/trainable': 747950, 'epoch': '3'}
 29%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                             | 69/242 [30:21<1:10:50, 24.57s/it] 29%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                           | 70/242 [30:49<1:13:22, 25.60s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.47', 'grad_norm': '0.232', 'learning_rate': '2.14e-05', 'ppl': '4.347', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.1', 'tokens/total': 1134592, 'tokens/trainable': 758895, 'epoch': '3.044'}
 29%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                                           | 70/242 [30:49<1:13:22, 25.60s/it] 29%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                          | 71/242 [31:16<1:13:46, 25.89s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.472', 'grad_norm': '0.1444', 'learning_rate': '2.128e-05', 'ppl': '4.36', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.7', 'tokens/total': 1150976, 'tokens/trainable': 770000, 'epoch': '3.088'}
 29%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                          | 71/242 [31:16<1:13:46, 25.89s/it] 30%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                        | 72/242 [31:42<1:13:55, 26.09s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.45', 'grad_norm': '0.1532', 'learning_rate': '2.116e-05', 'ppl': '4.262', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 1167360, 'tokens/trainable': 780962, 'epoch': '3.132'}
 30%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                        | 72/242 [31:42<1:13:55, 26.09s/it] 30%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                      | 73/242 [32:09<1:13:52, 26.23s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.477', 'grad_norm': '0.2559', 'learning_rate': '2.103e-05', 'ppl': '4.379', 'memory/max_active (GiB)': '64.63', 'memory/max_allocated (GiB)': '64.63', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '109.4', 'tokens/total': 1183744, 'tokens/trainable': 792247, 'epoch': '3.176'}
 30%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                      | 73/242 [32:09<1:13:52, 26.23s/it] 31%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                     | 74/242 [32:36<1:13:43, 26.33s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.421', 'grad_norm': '0.1782', 'learning_rate': '2.091e-05', 'ppl': '4.139', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.2', 'tokens/total': 1200128, 'tokens/trainable': 803475, 'epoch': '3.22'}
 31%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                                     | 74/242 [32:36<1:13:43, 26.33s/it] 31%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                   | 75/242 [33:02<1:13:28, 26.40s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.484', 'grad_norm': '0.1649', 'learning_rate': '2.078e-05', 'ppl': '4.412', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.73', 'tokens/total': 1216512, 'tokens/trainable': 814112, 'epoch': '3.264'}
 31%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                   | 75/242 [33:02<1:13:28, 26.40s/it] 31%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                 | 76/242 [33:29<1:13:10, 26.45s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.445', 'grad_norm': '0.2417', 'learning_rate': '2.065e-05', 'ppl': '4.243', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.2', 'tokens/total': 1232896, 'tokens/trainable': 825192, 'epoch': '3.308'}
 31%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                 | 76/242 [33:29<1:13:10, 26.45s/it] 32%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                | 77/242 [33:55<1:12:49, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.402', 'grad_norm': '0.2866', 'learning_rate': '2.052e-05', 'ppl': '4.064', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.4', 'tokens/total': 1249280, 'tokens/trainable': 836257, 'epoch': '3.352'}
 32%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                                | 77/242 [33:55<1:12:49, 26.48s/it] 32%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                              | 78/242 [34:22<1:12:27, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.443', 'grad_norm': '0.1874', 'learning_rate': '2.039e-05', 'ppl': '4.233', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.1', 'tokens/total': 1265664, 'tokens/trainable': 847342, 'epoch': '3.396'}
 32%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                              | 78/242 [34:22<1:12:27, 26.51s/it] 33%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                            | 79/242 [34:48<1:12:01, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.464', 'grad_norm': '0.1523', 'learning_rate': '2.026e-05', 'ppl': '4.325', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 1282048, 'tokens/trainable': 858319, 'epoch': '3.44'}
 33%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                            | 79/242 [34:48<1:12:01, 26.51s/it] 33%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                           | 80/242 [35:15<1:11:34, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.37', 'grad_norm': '0.2067', 'learning_rate': '2.012e-05', 'ppl': '3.937', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 1298432, 'tokens/trainable': 868733, 'epoch': '3.484'}
 33%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                           | 80/242 [35:15<1:11:34, 26.51s/it] 33%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                         | 81/242 [35:41<1:11:07, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.473', 'grad_norm': '0.1942', 'learning_rate': '1.998e-05', 'ppl': '4.361', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.3', 'tokens/total': 1314816, 'tokens/trainable': 879039, 'epoch': '3.527'}
 33%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                         | 81/242 [35:41<1:11:07, 26.51s/it] 34%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                       | 82/242 [36:08<1:10:40, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.45', 'grad_norm': '0.1106', 'learning_rate': '1.985e-05', 'ppl': '4.263', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 1331200, 'tokens/trainable': 889627, 'epoch': '3.571'}
 34%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                       | 82/242 [36:08<1:10:40, 26.50s/it] 34%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                      | 83/242 [36:34<1:10:13, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.439', 'grad_norm': '0.1371', 'learning_rate': '1.971e-05', 'ppl': '4.218', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.09', 'tokens/total': 1347584, 'tokens/trainable': 900245, 'epoch': '3.615'}
 34%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                      | 83/242 [36:34<1:10:13, 26.50s/it] 35%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                    | 84/242 [37:01<1:09:46, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.434', 'grad_norm': '0.2198', 'learning_rate': '1.957e-05', 'ppl': '4.196', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '98.02', 'tokens/total': 1363968, 'tokens/trainable': 910780, 'epoch': '3.659'}
 35%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                    | 84/242 [37:01<1:09:46, 26.50s/it] 35%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                  | 85/242 [37:27<1:09:20, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.437', 'grad_norm': '0.1317', 'learning_rate': '1.943e-05', 'ppl': '4.209', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.9', 'tokens/total': 1380352, 'tokens/trainable': 921944, 'epoch': '3.703'}
 35%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                  | 85/242 [37:27<1:09:20, 26.50s/it] 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                 | 86/242 [37:54<1:08:54, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.443', 'grad_norm': '0.1254', 'learning_rate': '1.928e-05', 'ppl': '4.234', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.1', 'tokens/total': 1396736, 'tokens/trainable': 933017, 'epoch': '3.747'}
 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                 | 86/242 [37:54<1:08:54, 26.50s/it] 36%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                               | 87/242 [38:20<1:08:28, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.473', 'grad_norm': '0.1218', 'learning_rate': '1.914e-05', 'ppl': '4.363', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '101', 'tokens/total': 1413120, 'tokens/trainable': 943866, 'epoch': '3.791'}
 36%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                               | 87/242 [38:20<1:08:28, 26.51s/it] 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                              | 88/242 [38:47<1:08:03, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.451', 'grad_norm': '0.1211', 'learning_rate': '1.899e-05', 'ppl': '4.266', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '98.46', 'tokens/total': 1429504, 'tokens/trainable': 955033, 'epoch': '3.835'}
 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                              | 88/242 [38:47<1:08:03, 26.52s/it] 37%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                            | 89/242 [39:13<1:07:36, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.446', 'grad_norm': '0.1203', 'learning_rate': '1.885e-05', 'ppl': '4.247', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '101.9', 'tokens/total': 1445888, 'tokens/trainable': 966322, 'epoch': '3.879'}
 37%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                            | 89/242 [39:13<1:07:36, 26.52s/it] 37%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                          | 90/242 [39:40<1:07:10, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.382', 'grad_norm': '0.1337', 'learning_rate': '1.87e-05', 'ppl': '3.982', 'memory/max_active (GiB)': '64.63', 'memory/max_allocated (GiB)': '64.63', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.3', 'tokens/total': 1462272, 'tokens/trainable': 976963, 'epoch': '3.923'}
 37%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                          | 90/242 [39:40<1:07:10, 26.52s/it] 38%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                         | 91/242 [40:06<1:06:44, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.38', 'grad_norm': '0.3956', 'learning_rate': '1.855e-05', 'ppl': '3.975', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 1478656, 'tokens/trainable': 987918, 'epoch': '3.967'}
 38%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                         | 91/242 [40:07<1:06:44, 26.52s/it] 38%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                       | 92/242 [40:26<1:01:23, 24.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.405', 'grad_norm': '0.1379', 'learning_rate': '1.84e-05', 'ppl': '4.074', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '140.6', 'tokens/total': 1490944, 'tokens/trainable': 996143, 'epoch': '4'}
 38%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                       | 92/242 [40:26<1:01:23, 24.56s/it] 38%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                     | 93/242 [40:55<1:03:40, 25.64s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.179', 'grad_norm': '0.2973', 'learning_rate': '1.825e-05', 'ppl': '3.25', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 1507328, 'tokens/trainable': 1006982, 'epoch': '4.044'}
 38%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                     | 93/242 [40:55<1:03:40, 25.64s/it] 39%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                    | 94/242 [41:21<1:03:55, 25.91s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.054', 'grad_norm': '0.2283', 'learning_rate': '1.81e-05', 'ppl': '2.868', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.7', 'tokens/total': 1523712, 'tokens/trainable': 1018109, 'epoch': '4.088'}
 39%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                    | 94/242 [41:21<1:03:55, 25.91s/it] 39%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                  | 95/242 [41:48<1:03:56, 26.10s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.098', 'grad_norm': '0.2046', 'learning_rate': '1.795e-05', 'ppl': '2.998', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 1540096, 'tokens/trainable': 1029115, 'epoch': '4.132'}
 39%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                  | 95/242 [41:48<1:03:56, 26.10s/it] 40%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                | 96/242 [42:14<1:03:48, 26.23s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.055', 'grad_norm': '0.2824', 'learning_rate': '1.779e-05', 'ppl': '2.873', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.6', 'tokens/total': 1556480, 'tokens/trainable': 1040121, 'epoch': '4.176'}
 40%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                | 96/242 [42:14<1:03:48, 26.23s/it] 40%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                               | 97/242 [42:41<1:03:35, 26.31s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.039', 'grad_norm': '0.1898', 'learning_rate': '1.764e-05', 'ppl': '2.827', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '103', 'tokens/total': 1572864, 'tokens/trainable': 1050616, 'epoch': '4.22'}
 40%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                               | 97/242 [42:41<1:03:35, 26.31s/it] 40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                             | 98/242 [43:07<1:03:17, 26.37s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.028', 'grad_norm': '0.1943', 'learning_rate': '1.748e-05', 'ppl': '2.795', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '99.74', 'tokens/total': 1589248, 'tokens/trainable': 1061467, 'epoch': '4.264'}
 40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                             | 98/242 [43:07<1:03:17, 26.37s/it] 41%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                           | 99/242 [43:34<1:02:58, 26.42s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.079', 'grad_norm': '0.4283', 'learning_rate': '1.732e-05', 'ppl': '2.942', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 1605632, 'tokens/trainable': 1072313, 'epoch': '4.308'}
 41%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                           | 99/242 [43:34<1:02:58, 26.42s/it] 41%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                         | 100/242 [44:00<1:02:36, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.086', 'grad_norm': '0.2085', 'learning_rate': '1.716e-05', 'ppl': '2.962', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '99.67', 'tokens/total': 1622016, 'tokens/trainable': 1083233, 'epoch': '4.352'}
 41%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                         | 100/242 [44:00<1:02:36, 26.46s/it] 42%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                        | 101/242 [44:27<1:02:13, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.079', 'grad_norm': '0.255', 'learning_rate': '1.701e-05', 'ppl': '2.942', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.4', 'tokens/total': 1638400, 'tokens/trainable': 1094462, 'epoch': '4.396'}
 42%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                        | 101/242 [44:27<1:02:13, 26.48s/it] 42%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                      | 102/242 [44:53<1:01:48, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.002', 'grad_norm': '4.584', 'learning_rate': '1.685e-05', 'ppl': '2.725', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.8', 'tokens/total': 1654784, 'tokens/trainable': 1105145, 'epoch': '4.44'}
 42%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                      | 102/242 [44:53<1:01:48, 26.49s/it] 43%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                    | 103/242 [45:20<1:01:22, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.045', 'grad_norm': '0.18', 'learning_rate': '1.669e-05', 'ppl': '2.845', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.7', 'tokens/total': 1671168, 'tokens/trainable': 1115999, 'epoch': '4.484'}
 43%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                    | 103/242 [45:20<1:01:22, 26.50s/it] 43%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                   | 104/242 [45:46<1:00:58, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.115', 'grad_norm': '0.3975', 'learning_rate': '1.652e-05', 'ppl': '3.051', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '112.8', 'tokens/total': 1687552, 'tokens/trainable': 1127507, 'epoch': '4.527'}
 43%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                   | 104/242 [45:46<1:00:58, 26.51s/it] 43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                 | 105/242 [46:13<1:00:31, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.053', 'grad_norm': '0.1848', 'learning_rate': '1.636e-05', 'ppl': '2.867', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 1703936, 'tokens/trainable': 1138986, 'epoch': '4.571'}
 43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                 | 105/242 [46:13<1:00:31, 26.51s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                               | 106/242 [46:39<1:00:05, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.014', 'grad_norm': '0.1749', 'learning_rate': '1.62e-05', 'ppl': '2.757', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.9', 'tokens/total': 1720320, 'tokens/trainable': 1150386, 'epoch': '4.615'}
 44%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                               | 106/242 [46:39<1:00:05, 26.51s/it] 44%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                               | 107/242 [47:06<59:39, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.12', 'grad_norm': '0.1478', 'learning_rate': '1.604e-05', 'ppl': '3.066', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '109.2', 'tokens/total': 1736704, 'tokens/trainable': 1161527, 'epoch': '4.659'}
 44%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                               | 107/242 [47:06<59:39, 26.51s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                             | 108/242 [47:32<59:13, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.007', 'grad_norm': '0.1431', 'learning_rate': '1.587e-05', 'ppl': '2.736', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.21', 'tokens/total': 1753088, 'tokens/trainable': 1172449, 'epoch': '4.703'}
 45%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                             | 108/242 [47:33<59:13, 26.52s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                           | 109/242 [47:59<58:46, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.043', 'grad_norm': '0.1831', 'learning_rate': '1.571e-05', 'ppl': '2.838', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 1769472, 'tokens/trainable': 1183378, 'epoch': '4.747'}
 45%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                           | 109/242 [47:59<58:46, 26.51s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                          | 110/242 [48:26<58:19, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.092', 'grad_norm': '2.04', 'learning_rate': '1.554e-05', 'ppl': '2.98', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '93.71', 'tokens/total': 1785856, 'tokens/trainable': 1193848, 'epoch': '4.791'}
 45%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                          | 110/242 [48:26<58:19, 26.51s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                        | 111/242 [48:52<57:52, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.147', 'grad_norm': '0.2025', 'learning_rate': '1.538e-05', 'ppl': '3.148', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '92.5', 'tokens/total': 1802240, 'tokens/trainable': 1204537, 'epoch': '4.835'}
 46%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                        | 111/242 [48:52<57:52, 26.51s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                       | 112/242 [49:19<57:26, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.042', 'grad_norm': '0.153', 'learning_rate': '1.521e-05', 'ppl': '2.835', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.5', 'tokens/total': 1818624, 'tokens/trainable': 1215446, 'epoch': '4.879'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                       | 112/242 [49:19<57:26, 26.51s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                     | 113/242 [49:45<57:00, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.099', 'grad_norm': '0.206', 'learning_rate': '1.504e-05', 'ppl': '3', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.7', 'tokens/total': 1835008, 'tokens/trainable': 1226121, 'epoch': '4.923'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                     | 113/242 [49:45<57:00, 26.51s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                   | 114/242 [50:12<56:35, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.096', 'grad_norm': '0.1569', 'learning_rate': '1.488e-05', 'ppl': '2.991', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 1851392, 'tokens/trainable': 1237124, 'epoch': '4.967'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                   | 114/242 [50:12<56:35, 26.53s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                  | 115/242 [50:32<52:01, 24.58s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '1.09', 'grad_norm': '0.1629', 'learning_rate': '1.471e-05', 'ppl': '2.975', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '148', 'tokens/total': 1863680, 'tokens/trainable': 1245780, 'epoch': '5'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                  | 115/242 [50:32<52:01, 24.58s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                | 116/242 [51:00<53:44, 25.59s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.8383', 'grad_norm': '0.3687', 'learning_rate': '1.454e-05', 'ppl': '2.312', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.2', 'tokens/total': 1880064, 'tokens/trainable': 1256816, 'epoch': '5.044'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                | 116/242 [51:00<53:44, 25.59s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                              | 117/242 [51:26<53:54, 25.88s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7805', 'grad_norm': '0.3193', 'learning_rate': '1.437e-05', 'ppl': '2.183', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 1896448, 'tokens/trainable': 1267784, 'epoch': '5.088'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                              | 117/242 [51:26<53:54, 25.88s/it] 49%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                             | 118/242 [51:53<53:53, 26.08s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.746', 'grad_norm': '0.1969', 'learning_rate': '1.42e-05', 'ppl': '2.109', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '98.43', 'tokens/total': 1912832, 'tokens/trainable': 1278876, 'epoch': '5.132'}
 49%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                             | 118/242 [51:53<53:53, 26.08s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                           | 119/242 [52:19<53:45, 26.23s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7531', 'grad_norm': '0.4715', 'learning_rate': '1.403e-05', 'ppl': '2.124', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105', 'tokens/total': 1929216, 'tokens/trainable': 1289898, 'epoch': '5.176'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                           | 119/242 [52:19<53:45, 26.23s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                         | 120/242 [52:46<53:31, 26.33s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7271', 'grad_norm': '0.3144', 'learning_rate': '1.386e-05', 'ppl': '2.069', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.5', 'tokens/total': 1945600, 'tokens/trainable': 1301107, 'epoch': '5.22'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                         | 120/242 [52:46<53:31, 26.33s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                        | 121/242 [53:12<53:14, 26.40s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7119', 'grad_norm': '0.246', 'learning_rate': '1.369e-05', 'ppl': '2.038', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 1961984, 'tokens/trainable': 1312272, 'epoch': '5.264'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                        | 121/242 [53:12<53:14, 26.40s/it] 50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                      | 122/242 [53:39<52:52, 26.44s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.761', 'grad_norm': '0.2445', 'learning_rate': '1.352e-05', 'ppl': '2.14', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105.6', 'tokens/total': 1978368, 'tokens/trainable': 1323350, 'epoch': '5.308'}
 50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                      | 122/242 [53:39<52:52, 26.44s/it] 51%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                    | 123/242 [54:05<52:30, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7372', 'grad_norm': '0.2063', 'learning_rate': '1.335e-05', 'ppl': '2.09', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105.4', 'tokens/total': 1994752, 'tokens/trainable': 1334153, 'epoch': '5.352'}
 51%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                    | 123/242 [54:05<52:30, 26.47s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                   | 124/242 [54:32<52:06, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6955', 'grad_norm': '0.2219', 'learning_rate': '1.318e-05', 'ppl': '2.005', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.2', 'tokens/total': 2011136, 'tokens/trainable': 1345198, 'epoch': '5.396'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                   | 124/242 [54:32<52:06, 26.49s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                 | 125/242 [54:59<51:40, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7114', 'grad_norm': '0.2242', 'learning_rate': '1.301e-05', 'ppl': '2.037', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.1', 'tokens/total': 2027520, 'tokens/trainable': 1356150, 'epoch': '5.44'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                 | 125/242 [54:59<51:40, 26.50s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                               | 126/242 [55:25<51:16, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6949', 'grad_norm': '0.3086', 'learning_rate': '1.284e-05', 'ppl': '2.003', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 2043904, 'tokens/trainable': 1367204, 'epoch': '5.484'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                               | 126/242 [55:25<51:16, 26.52s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                              | 127/242 [55:52<50:51, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7284', 'grad_norm': '0.3071', 'learning_rate': '1.267e-05', 'ppl': '2.072', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102', 'tokens/total': 2060288, 'tokens/trainable': 1377698, 'epoch': '5.527'}
 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                              | 127/242 [55:52<50:51, 26.53s/it] 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                            | 128/242 [56:18<50:25, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6839', 'grad_norm': '0.2157', 'learning_rate': '1.25e-05', 'ppl': '1.982', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.8', 'tokens/total': 2076672, 'tokens/trainable': 1389025, 'epoch': '5.571'}
 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                            | 128/242 [56:18<50:25, 26.54s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                          | 129/242 [56:45<49:58, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6826', 'grad_norm': '0.248', 'learning_rate': '1.233e-05', 'ppl': '1.979', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '96.44', 'tokens/total': 2093056, 'tokens/trainable': 1399635, 'epoch': '5.615'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                          | 129/242 [56:45<49:58, 26.53s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                         | 130/242 [57:11<49:30, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6872', 'grad_norm': '0.5028', 'learning_rate': '1.216e-05', 'ppl': '1.988', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.7', 'tokens/total': 2109440, 'tokens/trainable': 1410755, 'epoch': '5.659'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                         | 130/242 [57:11<49:30, 26.52s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                       | 131/242 [57:38<49:03, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7121', 'grad_norm': '0.2048', 'learning_rate': '1.199e-05', 'ppl': '2.038', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 2125824, 'tokens/trainable': 1421308, 'epoch': '5.703'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                       | 131/242 [57:38<49:03, 26.51s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                     | 132/242 [58:04<48:37, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6909', 'grad_norm': '0.2015', 'learning_rate': '1.182e-05', 'ppl': '1.995', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.1', 'tokens/total': 2142208, 'tokens/trainable': 1432118, 'epoch': '5.747'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                     | 132/242 [58:04<48:37, 26.52s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                    | 133/242 [58:31<48:10, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6724', 'grad_norm': '0.2809', 'learning_rate': '1.165e-05', 'ppl': '1.959', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.4', 'tokens/total': 2158592, 'tokens/trainable': 1443048, 'epoch': '5.791'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                    | 133/242 [58:31<48:10, 26.52s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                  | 134/242 [58:57<47:43, 26.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.6875', 'grad_norm': '0.2079', 'learning_rate': '1.148e-05', 'ppl': '1.989', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.9', 'tokens/total': 2174976, 'tokens/trainable': 1453777, 'epoch': '5.835'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                  | 134/242 [58:57<47:43, 26.52s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                | 135/242 [59:24<47:16, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7484', 'grad_norm': '0.3611', 'learning_rate': '1.131e-05', 'ppl': '2.114', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.13', 'tokens/total': 2191360, 'tokens/trainable': 1464590, 'epoch': '5.879'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                | 135/242 [59:24<47:16, 26.51s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                               | 136/242 [59:50<46:49, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7196', 'grad_norm': '0.645', 'learning_rate': '1.114e-05', 'ppl': '2.054', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.4', 'tokens/total': 2207744, 'tokens/trainable': 1475333, 'epoch': '5.923'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                               | 136/242 [59:50<46:49, 26.51s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                            | 137/242 [1:00:17<46:22, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7125', 'grad_norm': '4.188', 'learning_rate': '1.097e-05', 'ppl': '2.039', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.72', 'tokens/total': 2224128, 'tokens/trainable': 1486031, 'epoch': '5.967'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                            | 137/242 [1:00:17<46:22, 26.50s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                           | 138/242 [1:00:37<42:31, 24.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.7655', 'grad_norm': '0.2723', 'learning_rate': '1.08e-05', 'ppl': '2.15', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '139.4', 'tokens/total': 2236416, 'tokens/trainable': 1493715, 'epoch': '6'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                           | 138/242 [1:00:37<42:31, 24.53s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                         | 139/242 [1:01:05<43:52, 25.55s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4499', 'grad_norm': '0.3383', 'learning_rate': '1.063e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '103.8', 'tokens/total': 2252800, 'tokens/trainable': 1504943, 'epoch': '6.044'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                         | 139/242 [1:01:05<43:52, 25.55s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                       | 140/242 [1:01:31<43:55, 25.84s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.462', 'grad_norm': '0.3111', 'learning_rate': '1.046e-05', 'ppl': '1.587', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.1', 'tokens/total': 2269184, 'tokens/trainable': 1516055, 'epoch': '6.088'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                       | 140/242 [1:01:31<43:55, 25.84s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                      | 141/242 [1:01:58<43:50, 26.05s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4183', 'grad_norm': '0.2581', 'learning_rate': '1.029e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '102.2', 'tokens/total': 2285568, 'tokens/trainable': 1526799, 'epoch': '6.132'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                      | 141/242 [1:01:58<43:50, 26.05s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                    | 142/242 [1:02:24<43:37, 26.18s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4668', 'grad_norm': '0.4511', 'learning_rate': '1.012e-05', 'ppl': '1.595', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.3', 'tokens/total': 2301952, 'tokens/trainable': 1537833, 'epoch': '6.176'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                    | 142/242 [1:02:24<43:37, 26.18s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                  | 143/242 [1:02:51<43:21, 26.28s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4344', 'grad_norm': '1.089', 'learning_rate': '9.957e-06', 'ppl': '1.544', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.5', 'tokens/total': 2318336, 'tokens/trainable': 1549158, 'epoch': '6.22'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                  | 143/242 [1:02:51<43:21, 26.28s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                 | 144/242 [1:03:17<43:02, 26.35s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4104', 'grad_norm': '0.7761', 'learning_rate': '9.79e-06', 'ppl': '1.507', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105.8', 'tokens/total': 2334720, 'tokens/trainable': 1560641, 'epoch': '6.264'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                 | 144/242 [1:03:17<43:02, 26.35s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                               | 145/242 [1:03:44<42:40, 26.40s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4165', 'grad_norm': '0.6537', 'learning_rate': '9.623e-06', 'ppl': '1.517', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 2351104, 'tokens/trainable': 1571636, 'epoch': '6.308'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                               | 145/242 [1:03:44<42:40, 26.40s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                              | 146/242 [1:04:10<42:17, 26.44s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4192', 'grad_norm': '0.2966', 'learning_rate': '9.458e-06', 'ppl': '1.521', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 2367488, 'tokens/trainable': 1582966, 'epoch': '6.352'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                              | 146/242 [1:04:10<42:17, 26.44s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                            | 147/242 [1:04:37<41:53, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4665', 'grad_norm': '0.9001', 'learning_rate': '9.292e-06', 'ppl': '1.594', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 2383872, 'tokens/trainable': 1593742, 'epoch': '6.396'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                            | 147/242 [1:04:37<41:53, 26.46s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                          | 148/242 [1:05:03<41:28, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4031', 'grad_norm': '0.2423', 'learning_rate': '9.128e-06', 'ppl': '1.496', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100', 'tokens/total': 2400256, 'tokens/trainable': 1604750, 'epoch': '6.44'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                          | 148/242 [1:05:03<41:28, 26.48s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                         | 149/242 [1:05:30<41:03, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4209', 'grad_norm': '0.3619', 'learning_rate': '8.963e-06', 'ppl': '1.523', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 2416640, 'tokens/trainable': 1615871, 'epoch': '6.484'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                         | 149/242 [1:05:30<41:03, 26.48s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                       | 150/242 [1:05:56<40:37, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4268', 'grad_norm': '0.2574', 'learning_rate': '8.8e-06', 'ppl': '1.532', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '93.89', 'tokens/total': 2433024, 'tokens/trainable': 1626536, 'epoch': '6.527'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                       | 150/242 [1:05:56<40:37, 26.49s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                     | 151/242 [1:06:23<40:10, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4154', 'grad_norm': '0.2806', 'learning_rate': '8.637e-06', 'ppl': '1.515', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.2', 'tokens/total': 2449408, 'tokens/trainable': 1637026, 'epoch': '6.571'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                     | 151/242 [1:06:23<40:10, 26.49s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                    | 152/242 [1:06:49<39:44, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4097', 'grad_norm': '0.3365', 'learning_rate': '8.475e-06', 'ppl': '1.506', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '100.2', 'tokens/total': 2465792, 'tokens/trainable': 1647688, 'epoch': '6.615'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                    | 152/242 [1:06:49<39:44, 26.49s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                  | 153/242 [1:07:16<39:17, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.3648', 'grad_norm': '0.286', 'learning_rate': '8.314e-06', 'ppl': '1.44', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '91.33', 'tokens/total': 2482176, 'tokens/trainable': 1658504, 'epoch': '6.659'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                  | 153/242 [1:07:16<39:17, 26.49s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                | 154/242 [1:07:42<38:50, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.3935', 'grad_norm': '0.3063', 'learning_rate': '8.154e-06', 'ppl': '1.482', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '97.61', 'tokens/total': 2498560, 'tokens/trainable': 1668861, 'epoch': '6.703'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                | 154/242 [1:07:42<38:50, 26.49s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                               | 155/242 [1:08:09<38:24, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4277', 'grad_norm': '0.2728', 'learning_rate': '7.994e-06', 'ppl': '1.534', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '110.2', 'tokens/total': 2514944, 'tokens/trainable': 1679343, 'epoch': '6.747'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                               | 155/242 [1:08:09<38:24, 26.49s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                             | 156/242 [1:08:35<37:57, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4166', 'grad_norm': '1.109', 'learning_rate': '7.835e-06', 'ppl': '1.517', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '103.5', 'tokens/total': 2531328, 'tokens/trainable': 1689913, 'epoch': '6.791'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                             | 156/242 [1:08:35<37:57, 26.49s/it] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                           | 157/242 [1:09:02<37:32, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.3972', 'grad_norm': '0.2504', 'learning_rate': '7.677e-06', 'ppl': '1.488', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '96.8', 'tokens/total': 2547712, 'tokens/trainable': 1700708, 'epoch': '6.835'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                           | 157/242 [1:09:02<37:32, 26.50s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                          | 158/242 [1:09:28<37:06, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4518', 'grad_norm': '0.2606', 'learning_rate': '7.52e-06', 'ppl': '1.571', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 2564096, 'tokens/trainable': 1711443, 'epoch': '6.879'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                          | 158/242 [1:09:28<37:06, 26.50s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                        | 159/242 [1:09:55<36:39, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4068', 'grad_norm': '0.3', 'learning_rate': '7.364e-06', 'ppl': '1.502', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '96.52', 'tokens/total': 2580480, 'tokens/trainable': 1722090, 'epoch': '6.923'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                        | 159/242 [1:09:55<36:39, 26.50s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                      | 160/242 [1:10:21<36:14, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.3909', 'grad_norm': '0.2364', 'learning_rate': '7.209e-06', 'ppl': '1.478', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '105.1', 'tokens/total': 2596864, 'tokens/trainable': 1732868, 'epoch': '6.967'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                      | 160/242 [1:10:21<36:14, 26.51s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                     | 161/242 [1:10:41<33:09, 24.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.4135', 'grad_norm': '42.07', 'learning_rate': '7.054e-06', 'ppl': '1.512', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '148.7', 'tokens/total': 2609152, 'tokens/trainable': 1741509, 'epoch': '7'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                     | 161/242 [1:10:41<33:09, 24.56s/it] 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                   | 162/242 [1:11:09<34:07, 25.59s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2476', 'grad_norm': '0.3178', 'learning_rate': '6.901e-06', 'ppl': '1.281', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '106.7', 'tokens/total': 2625536, 'tokens/trainable': 1752592, 'epoch': '7.044'}
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                   | 162/242 [1:11:09<34:07, 25.59s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                  | 163/242 [1:11:36<34:03, 25.87s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2201', 'grad_norm': '0.2648', 'learning_rate': '6.749e-06', 'ppl': '1.246', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 2641920, 'tokens/trainable': 1763380, 'epoch': '7.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                  | 163/242 [1:11:36<34:03, 25.87s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                | 164/242 [1:12:02<33:53, 26.07s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2344', 'grad_norm': '0.2192', 'learning_rate': '6.598e-06', 'ppl': '1.264', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 2658304, 'tokens/trainable': 1774619, 'epoch': '7.132'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                | 164/242 [1:12:02<33:53, 26.07s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                              | 165/242 [1:12:29<33:38, 26.21s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2173', 'grad_norm': '0.2133', 'learning_rate': '6.448e-06', 'ppl': '1.243', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '107.6', 'tokens/total': 2674688, 'tokens/trainable': 1785754, 'epoch': '7.176'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                              | 165/242 [1:12:29<33:38, 26.21s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                             | 166/242 [1:12:55<33:18, 26.30s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2172', 'grad_norm': '0.424', 'learning_rate': '6.299e-06', 'ppl': '1.243', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.37', 'tokens/train_per_sec_per_gpu': '104.8', 'tokens/total': 2691072, 'tokens/trainable': 1796757, 'epoch': '7.22'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                             | 166/242 [1:12:55<33:18, 26.30s/it] 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                           | 167/242 [1:13:22<32:57, 26.36s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2001', 'grad_norm': '0.2853', 'learning_rate': '6.152e-06', 'ppl': '1.222', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 2707456, 'tokens/trainable': 1808109, 'epoch': '7.264'}
 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                           | 167/242 [1:13:22<32:57, 26.36s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                         | 168/242 [1:13:48<32:34, 26.41s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2348', 'grad_norm': '0.2743', 'learning_rate': '6.005e-06', 'ppl': '1.265', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.8', 'tokens/total': 2723840, 'tokens/trainable': 1819019, 'epoch': '7.308'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                         | 168/242 [1:13:48<32:34, 26.41s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 169/242 [1:14:15<32:10, 26.45s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.228', 'grad_norm': '0.4179', 'learning_rate': '5.86e-06', 'ppl': '1.256', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '110.8', 'tokens/total': 2740224, 'tokens/trainable': 1830476, 'epoch': '7.352'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 169/242 [1:14:15<32:10, 26.45s/it] 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 170/242 [1:14:41<31:45, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1905', 'grad_norm': '0.2388', 'learning_rate': '5.716e-06', 'ppl': '1.21', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '101.4', 'tokens/total': 2756608, 'tokens/trainable': 1841456, 'epoch': '7.396'}
 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 170/242 [1:14:41<31:45, 26.46s/it] 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 171/242 [1:15:08<31:18, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.202', 'grad_norm': '0.2414', 'learning_rate': '5.573e-06', 'ppl': '1.224', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '106', 'tokens/total': 2772992, 'tokens/trainable': 1852178, 'epoch': '7.44'}
 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 171/242 [1:15:08<31:18, 26.46s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 172/242 [1:15:34<30:52, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2015', 'grad_norm': '0.2207', 'learning_rate': '5.432e-06', 'ppl': '1.223', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '90.02', 'tokens/total': 2789376, 'tokens/trainable': 1862629, 'epoch': '7.484'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 172/242 [1:15:34<30:52, 26.46s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 173/242 [1:16:01<30:26, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2232', 'grad_norm': '0.3557', 'learning_rate': '5.291e-06', 'ppl': '1.25', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '97.09', 'tokens/total': 2805760, 'tokens/trainable': 1873517, 'epoch': '7.527'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 173/242 [1:16:01<30:26, 26.47s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 174/242 [1:16:27<30:00, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1901', 'grad_norm': '0.2697', 'learning_rate': '5.153e-06', 'ppl': '1.209', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '105.5', 'tokens/total': 2822144, 'tokens/trainable': 1884480, 'epoch': '7.571'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 174/242 [1:16:27<30:00, 26.48s/it] 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 175/242 [1:16:54<29:34, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2168', 'grad_norm': '0.8351', 'learning_rate': '5.015e-06', 'ppl': '1.242', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '97.97', 'tokens/total': 2838528, 'tokens/trainable': 1895206, 'epoch': '7.615'}
 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 175/242 [1:16:54<29:34, 26.49s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 176/242 [1:17:20<29:08, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2088', 'grad_norm': '0.2571', 'learning_rate': '4.879e-06', 'ppl': '1.232', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 2854912, 'tokens/trainable': 1906209, 'epoch': '7.659'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 176/242 [1:17:20<29:08, 26.49s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 177/242 [1:17:47<28:41, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2047', 'grad_norm': '0.2407', 'learning_rate': '4.745e-06', 'ppl': '1.227', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '109.7', 'tokens/total': 2871296, 'tokens/trainable': 1917319, 'epoch': '7.703'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 177/242 [1:17:47<28:41, 26.49s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 178/242 [1:18:13<28:15, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2207', 'grad_norm': '0.2253', 'learning_rate': '4.611e-06', 'ppl': '1.247', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '103.9', 'tokens/total': 2887680, 'tokens/trainable': 1928568, 'epoch': '7.747'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 178/242 [1:18:13<28:15, 26.49s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 179/242 [1:18:40<27:48, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1961', 'grad_norm': '0.221', 'learning_rate': '4.48e-06', 'ppl': '1.217', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '91.18', 'tokens/total': 2904064, 'tokens/trainable': 1939160, 'epoch': '7.791'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 179/242 [1:18:40<27:48, 26.48s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 180/242 [1:19:06<27:21, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1962', 'grad_norm': '0.217', 'learning_rate': '4.349e-06', 'ppl': '1.217', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.3', 'tokens/total': 2920448, 'tokens/trainable': 1949592, 'epoch': '7.835'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 180/242 [1:19:06<27:21, 26.47s/it] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 181/242 [1:19:33<26:54, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2221', 'grad_norm': '0.3784', 'learning_rate': '4.221e-06', 'ppl': '1.249', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 2936832, 'tokens/trainable': 1960146, 'epoch': '7.879'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 181/242 [1:19:33<26:54, 26.46s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 182/242 [1:19:59<26:28, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.196', 'grad_norm': '0.2209', 'learning_rate': '4.094e-06', 'ppl': '1.217', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '102.3', 'tokens/total': 2953216, 'tokens/trainable': 1971039, 'epoch': '7.923'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 182/242 [1:19:59<26:28, 26.47s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 183/242 [1:20:26<26:01, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2359', 'grad_norm': '0.2514', 'learning_rate': '3.968e-06', 'ppl': '1.266', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '98.52', 'tokens/total': 2969600, 'tokens/trainable': 1981577, 'epoch': '7.967'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 183/242 [1:20:26<26:01, 26.47s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 184/242 [1:20:46<23:41, 24.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.2288', 'grad_norm': '0.264', 'learning_rate': '3.844e-06', 'ppl': '1.257', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '131.2', 'tokens/total': 2981888, 'tokens/trainable': 1989486, 'epoch': '8'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 184/242 [1:20:46<23:41, 24.51s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 185/242 [1:21:14<24:15, 25.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1209', 'grad_norm': '0.1936', 'learning_rate': '3.722e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '102.9', 'tokens/total': 2998272, 'tokens/trainable': 2000622, 'epoch': '8.044'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 185/242 [1:21:14<24:15, 25.54s/it] 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 186/242 [1:21:40<24:05, 25.82s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1142', 'grad_norm': '0.1893', 'learning_rate': '3.601e-06', 'ppl': '1.121', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 3014656, 'tokens/trainable': 2011774, 'epoch': '8.088'}
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 186/242 [1:21:40<24:05, 25.82s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 187/242 [1:22:06<23:50, 26.02s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1056', 'grad_norm': '0.1745', 'learning_rate': '3.482e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '64.63', 'memory/max_allocated (GiB)': '64.63', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '103.2', 'tokens/total': 3031040, 'tokens/trainable': 2022621, 'epoch': '8.132'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 187/242 [1:22:07<23:50, 26.02s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 188/242 [1:22:33<23:32, 26.17s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.1025', 'grad_norm': '0.1749', 'learning_rate': '3.365e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 3047424, 'tokens/trainable': 2033870, 'epoch': '8.176'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 188/242 [1:22:33<23:32, 26.17s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 189/242 [1:22:59<23:11, 26.26s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.09691', 'grad_norm': '0.2499', 'learning_rate': '3.249e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '97.38', 'tokens/total': 3063808, 'tokens/trainable': 2044823, 'epoch': '8.22'}
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 189/242 [1:23:00<23:11, 26.26s/it] 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 190/242 [1:23:26<22:49, 26.33s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.09358', 'grad_norm': '0.208', 'learning_rate': '3.135e-06', 'ppl': '1.098', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '106', 'tokens/total': 3080192, 'tokens/trainable': 2055744, 'epoch': '8.264'}
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 190/242 [1:23:26<22:49, 26.33s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 191/242 [1:23:52<22:25, 26.38s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08495', 'grad_norm': '0.1827', 'learning_rate': '3.023e-06', 'ppl': '1.089', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '105.4', 'tokens/total': 3096576, 'tokens/trainable': 2066568, 'epoch': '8.308'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 191/242 [1:23:53<22:25, 26.38s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 192/242 [1:24:19<22:00, 26.41s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08753', 'grad_norm': '0.1976', 'learning_rate': '2.912e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.6', 'tokens/total': 3112960, 'tokens/trainable': 2077760, 'epoch': '8.352'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 192/242 [1:24:19<22:00, 26.41s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 193/242 [1:24:45<21:35, 26.44s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.0979', 'grad_norm': '0.3241', 'learning_rate': '2.804e-06', 'ppl': '1.103', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.4', 'tokens/total': 3129344, 'tokens/trainable': 2088804, 'epoch': '8.396'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 193/242 [1:24:45<21:35, 26.44s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 194/242 [1:25:12<21:10, 26.46s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.06911', 'grad_norm': '0.1529', 'learning_rate': '2.697e-06', 'ppl': '1.072', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '105.6', 'tokens/total': 3145728, 'tokens/trainable': 2100025, 'epoch': '8.44'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 194/242 [1:25:12<21:10, 26.46s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 195/242 [1:25:38<20:44, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.07992', 'grad_norm': '0.1843', 'learning_rate': '2.592e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '97.98', 'tokens/total': 3162112, 'tokens/trainable': 2110347, 'epoch': '8.484'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 195/242 [1:25:38<20:44, 26.47s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 196/242 [1:26:05<20:17, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.0757', 'grad_norm': '0.186', 'learning_rate': '2.489e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '64.63', 'memory/max_allocated (GiB)': '64.63', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.6', 'tokens/total': 3178496, 'tokens/trainable': 2121342, 'epoch': '8.527'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 196/242 [1:26:05<20:17, 26.48s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 197/242 [1:26:31<19:51, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08635', 'grad_norm': '0.1648', 'learning_rate': '2.387e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '98.26', 'tokens/total': 3194880, 'tokens/trainable': 2132330, 'epoch': '8.571'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 197/242 [1:26:31<19:51, 26.49s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 198/242 [1:26:58<19:25, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.09255', 'grad_norm': '0.2116', 'learning_rate': '2.288e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 3211264, 'tokens/trainable': 2143813, 'epoch': '8.615'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 198/242 [1:26:58<19:25, 26.49s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 199/242 [1:27:24<18:58, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08609', 'grad_norm': '0.3673', 'learning_rate': '2.19e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '103.7', 'tokens/total': 3227648, 'tokens/trainable': 2154541, 'epoch': '8.659'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 199/242 [1:27:24<18:58, 26.49s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 200/242 [1:27:51<18:32, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08165', 'grad_norm': '0.1965', 'learning_rate': '2.095e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.6', 'tokens/total': 3244032, 'tokens/trainable': 2164848, 'epoch': '8.703'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 200/242 [1:27:51<18:32, 26.48s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 201/242 [1:28:17<18:06, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.09225', 'grad_norm': '0.5852', 'learning_rate': '2.001e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 3260416, 'tokens/trainable': 2175663, 'epoch': '8.747'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 201/242 [1:28:17<18:06, 26.49s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 202/242 [1:28:44<17:39, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08876', 'grad_norm': '0.1796', 'learning_rate': '1.909e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.9', 'tokens/total': 3276800, 'tokens/trainable': 2186828, 'epoch': '8.791'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 202/242 [1:28:44<17:39, 26.49s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 203/242 [1:29:10<17:13, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08665', 'grad_norm': '0.233', 'learning_rate': '1.82e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.3', 'tokens/total': 3293184, 'tokens/trainable': 2197788, 'epoch': '8.835'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 203/242 [1:29:10<17:13, 26.50s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 204/242 [1:29:37<16:46, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08208', 'grad_norm': '0.2867', 'learning_rate': '1.732e-06', 'ppl': '1.086', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.18', 'tokens/total': 3309568, 'tokens/trainable': 2208779, 'epoch': '8.879'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 204/242 [1:29:37<16:46, 26.49s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 205/242 [1:30:03<16:20, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08634', 'grad_norm': '0.1997', 'learning_rate': '1.646e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '98.68', 'tokens/total': 3325952, 'tokens/trainable': 2219084, 'epoch': '8.923'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 205/242 [1:30:03<16:20, 26.49s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 206/242 [1:30:30<15:53, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.08076', 'grad_norm': '9.82', 'learning_rate': '1.563e-06', 'ppl': '1.084', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '102.7', 'tokens/total': 3342336, 'tokens/trainable': 2229961, 'epoch': '8.967'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 206/242 [1:30:30<15:53, 26.49s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 207/242 [1:30:50<14:18, 24.52s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.06287', 'grad_norm': '0.1947', 'learning_rate': '1.481e-06', 'ppl': '1.065', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '126.7', 'tokens/total': 3354624, 'tokens/trainable': 2237741, 'epoch': '9'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 207/242 [1:30:50<14:18, 24.52s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 208/242 [1:31:18<14:28, 25.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.0568', 'grad_norm': '0.1199', 'learning_rate': '1.401e-06', 'ppl': '1.058', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 3371008, 'tokens/trainable': 2249119, 'epoch': '9.044'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 208/242 [1:31:18<14:28, 25.54s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 209/242 [1:31:44<14:12, 25.85s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04781', 'grad_norm': '0.131', 'learning_rate': '1.324e-06', 'ppl': '1.049', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '106.4', 'tokens/total': 3387392, 'tokens/trainable': 2260391, 'epoch': '9.088'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 209/242 [1:31:44<14:12, 25.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 210/242 [1:32:11<13:53, 26.04s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04088', 'grad_norm': '0.1361', 'learning_rate': '1.248e-06', 'ppl': '1.042', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 3403776, 'tokens/trainable': 2271507, 'epoch': '9.132'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 210/242 [1:32:11<13:53, 26.04s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 211/242 [1:32:37<13:31, 26.18s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04817', 'grad_norm': '0.124', 'learning_rate': '1.175e-06', 'ppl': '1.049', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '103.4', 'tokens/total': 3420160, 'tokens/trainable': 2282572, 'epoch': '9.176'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 211/242 [1:32:37<13:31, 26.18s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 212/242 [1:33:04<13:08, 26.28s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04661', 'grad_norm': '0.1146', 'learning_rate': '1.104e-06', 'ppl': '1.048', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '106.3', 'tokens/total': 3436544, 'tokens/trainable': 2293884, 'epoch': '9.22'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 212/242 [1:33:04<13:08, 26.28s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 213/242 [1:33:30<12:43, 26.34s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04383', 'grad_norm': '0.1586', 'learning_rate': '1.035e-06', 'ppl': '1.045', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '105.8', 'tokens/total': 3452928, 'tokens/trainable': 2304647, 'epoch': '9.264'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 213/242 [1:33:30<12:43, 26.34s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 214/242 [1:33:57<12:18, 26.38s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04379', 'grad_norm': '0.1338', 'learning_rate': '9.679e-07', 'ppl': '1.045', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '103', 'tokens/total': 3469312, 'tokens/trainable': 2315611, 'epoch': '9.308'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 214/242 [1:33:57<12:18, 26.38s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 215/242 [1:34:23<11:53, 26.41s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04436', 'grad_norm': '0.123', 'learning_rate': '9.031e-07', 'ppl': '1.045', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.1', 'tokens/total': 3485696, 'tokens/trainable': 2326791, 'epoch': '9.352'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 215/242 [1:34:23<11:53, 26.41s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 216/242 [1:34:50<11:27, 26.43s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04632', 'grad_norm': '0.1199', 'learning_rate': '8.405e-07', 'ppl': '1.047', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.5', 'tokens/total': 3502080, 'tokens/trainable': 2338019, 'epoch': '9.396'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 216/242 [1:34:50<11:27, 26.43s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 217/242 [1:35:16<11:01, 26.45s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.047', 'grad_norm': '0.1362', 'learning_rate': '7.8e-07', 'ppl': '1.048', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.3', 'tokens/total': 3518464, 'tokens/trainable': 2349163, 'epoch': '9.44'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 217/242 [1:35:16<11:01, 26.45s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 218/242 [1:35:43<10:35, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04525', 'grad_norm': '0.1468', 'learning_rate': '7.217e-07', 'ppl': '1.046', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '100.4', 'tokens/total': 3534848, 'tokens/trainable': 2360015, 'epoch': '9.484'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 218/242 [1:35:43<10:35, 26.47s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 219/242 [1:36:09<10:08, 26.47s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03828', 'grad_norm': '0.4156', 'learning_rate': '6.657e-07', 'ppl': '1.039', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '98.51', 'tokens/total': 3551232, 'tokens/trainable': 2370736, 'epoch': '9.527'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 219/242 [1:36:09<10:08, 26.47s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 220/242 [1:36:36<09:42, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04471', 'grad_norm': '0.2394', 'learning_rate': '6.118e-07', 'ppl': '1.046', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '104.2', 'tokens/total': 3567616, 'tokens/trainable': 2381783, 'epoch': '9.571'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 220/242 [1:36:36<09:42, 26.49s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 221/242 [1:37:02<09:16, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04301', 'grad_norm': '0.1714', 'learning_rate': '5.601e-07', 'ppl': '1.044', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.81', 'tokens/total': 3584000, 'tokens/trainable': 2392777, 'epoch': '9.615'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 221/242 [1:37:02<09:16, 26.50s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 222/242 [1:37:29<08:49, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04949', 'grad_norm': '0.144', 'learning_rate': '5.107e-07', 'ppl': '1.051', 'memory/max_active (GiB)': '64.68', 'memory/max_allocated (GiB)': '64.68', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '101', 'tokens/total': 3600384, 'tokens/trainable': 2403646, 'epoch': '9.659'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 222/242 [1:37:29<08:49, 26.49s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 223/242 [1:37:55<08:23, 26.49s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03992', 'grad_norm': '0.6863', 'learning_rate': '4.635e-07', 'ppl': '1.041', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '102.8', 'tokens/total': 3616768, 'tokens/trainable': 2414246, 'epoch': '9.703'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 223/242 [1:37:55<08:23, 26.49s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 224/242 [1:38:22<07:57, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04563', 'grad_norm': '0.1162', 'learning_rate': '4.186e-07', 'ppl': '1.047', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 3633152, 'tokens/trainable': 2424828, 'epoch': '9.747'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 224/242 [1:38:22<07:57, 26.50s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 225/242 [1:38:48<07:30, 26.50s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03843', 'grad_norm': '0.2382', 'learning_rate': '3.759e-07', 'ppl': '1.039', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '96.54', 'tokens/total': 3649536, 'tokens/trainable': 2435689, 'epoch': '9.791'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 225/242 [1:38:48<07:30, 26.50s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 226/242 [1:39:15<07:04, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04201', 'grad_norm': '0.2591', 'learning_rate': '3.355e-07', 'ppl': '1.043', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.19', 'tokens/total': 3665920, 'tokens/trainable': 2446209, 'epoch': '9.835'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 226/242 [1:39:15<07:04, 26.51s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 227/242 [1:39:41<06:37, 26.51s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.0337', 'grad_norm': '0.3837', 'learning_rate': '2.973e-07', 'ppl': '1.034', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 3682304, 'tokens/trainable': 2456963, 'epoch': '9.879'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 227/242 [1:39:41<06:37, 26.51s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 228/242 [1:40:08<06:11, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.04329', 'grad_norm': '0.1043', 'learning_rate': '2.614e-07', 'ppl': '1.044', 'memory/max_active (GiB)': '64.76', 'memory/max_allocated (GiB)': '64.76', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.45', 'tokens/total': 3698688, 'tokens/trainable': 2468113, 'epoch': '9.923'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 228/242 [1:40:08<06:11, 26.53s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 229/242 [1:40:34<05:44, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03725', 'grad_norm': '0.1196', 'learning_rate': '2.279e-07', 'ppl': '1.038', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 3715072, 'tokens/trainable': 2479373, 'epoch': '9.967'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 229/242 [1:40:34<05:44, 26.54s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 230/242 [1:40:54<04:54, 24.58s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03604', 'grad_norm': '0.1735', 'learning_rate': '1.965e-07', 'ppl': '1.037', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '123.3', 'tokens/total': 3727360, 'tokens/trainable': 2487084, 'epoch': '10'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 230/242 [1:40:55<04:54, 24.58s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 231/242 [1:41:23<04:41, 25.62s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03702', 'grad_norm': '0.1628', 'learning_rate': '1.675e-07', 'ppl': '1.038', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 3743744, 'tokens/trainable': 2498525, 'epoch': '10.04'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 231/242 [1:41:23<04:41, 25.62s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 232/242 [1:41:49<04:19, 25.92s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03328', 'grad_norm': '0.08659', 'learning_rate': '1.408e-07', 'ppl': '1.034', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '109.7', 'tokens/total': 3760128, 'tokens/trainable': 2509661, 'epoch': '10.09'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 232/242 [1:41:49<04:19, 25.92s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 233/242 [1:42:16<03:55, 26.14s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03694', 'grad_norm': '0.1155', 'learning_rate': '1.164e-07', 'ppl': '1.038', 'memory/max_active (GiB)': '64.72', 'memory/max_allocated (GiB)': '64.72', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '107.5', 'tokens/total': 3776512, 'tokens/trainable': 2520919, 'epoch': '10.13'}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 233/242 [1:42:16<03:55, 26.14s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 234/242 [1:42:42<03:30, 26.26s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03262', 'grad_norm': '0.08284', 'learning_rate': '9.433e-08', 'ppl': '1.033', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '102', 'tokens/total': 3792896, 'tokens/trainable': 2531851, 'epoch': '10.18'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 234/242 [1:42:42<03:30, 26.26s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 235/242 [1:43:09<03:04, 26.34s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03055', 'grad_norm': '0.09425', 'learning_rate': '7.455e-08', 'ppl': '1.031', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.11', 'tokens/total': 3809280, 'tokens/trainable': 2542483, 'epoch': '10.22'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 235/242 [1:43:09<03:04, 26.34s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 236/242 [1:43:35<02:38, 26.41s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03248', 'grad_norm': '0.1235', 'learning_rate': '5.709e-08', 'ppl': '1.033', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '98.8', 'tokens/total': 3825664, 'tokens/trainable': 2553625, 'epoch': '10.26'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 236/242 [1:43:36<02:38, 26.41s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 237/242 [1:44:02<02:12, 26.48s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.02933', 'grad_norm': '0.09895', 'learning_rate': '4.195e-08', 'ppl': '1.03', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '100.9', 'tokens/total': 3842048, 'tokens/trainable': 2564591, 'epoch': '10.31'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 237/242 [1:44:02<02:12, 26.48s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 238/242 [1:44:29<01:46, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03715', 'grad_norm': '0.09533', 'learning_rate': '2.914e-08', 'ppl': '1.038', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '106.7', 'tokens/total': 3858432, 'tokens/trainable': 2575549, 'epoch': '10.35'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 238/242 [1:44:29<01:46, 26.53s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 239/242 [1:44:55<01:19, 26.53s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03608', 'grad_norm': '0.1043', 'learning_rate': '1.865e-08', 'ppl': '1.037', 'memory/max_active (GiB)': '64.84', 'memory/max_allocated (GiB)': '64.84', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '99.97', 'tokens/total': 3874816, 'tokens/trainable': 2586453, 'epoch': '10.4'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 239/242 [1:44:55<01:19, 26.53s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 240/242 [1:45:22<00:53, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03286', 'grad_norm': '0.09626', 'learning_rate': '1.049e-08', 'ppl': '1.033', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '105.1', 'tokens/total': 3891200, 'tokens/trainable': 2597346, 'epoch': '10.44'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 240/242 [1:45:22<00:53, 26.54s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 241/242 [1:45:48<00:26, 26.54s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.03487', 'grad_norm': '0.08951', 'learning_rate': '4.664e-09', 'ppl': '1.035', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '101.7', 'tokens/total': 3907584, 'tokens/trainable': 2608204, 'epoch': '10.48'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 241/242 [1:45:48<00:26, 26.54s/it]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242/242 [1:46:15<00:00, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'loss': '0.02892', 'grad_norm': '0.08585', 'learning_rate': '1.166e-09', 'ppl': '1.029', 'memory/max_active (GiB)': '64.8', 'memory/max_allocated (GiB)': '64.8', 'memory/device_reserved (GiB)': '65.39', 'tokens/train_per_sec_per_gpu': '109.9', 'tokens/total': 3923968, 'tokens/trainable': 2618861, 'epoch': '10.53'}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242/242 [1:46:15<00:00, 26.56s/it]                                                                                                                                                                                                                                                                                                                                                                                                                                                       {'train_runtime': '6376', 'train_samples_per_second': '0.304', 'train_steps_per_second': '0.038', 'train_loss': '0.9716', 'memory/max_active (GiB)': '59.83', 'memory/max_allocated (GiB)': '59.83', 'memory/device_reserved (GiB)': '65.39', 'epoch': '10.53', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242/242 [1:46:15<00:00, 26.56s/it]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242/242 [1:46:15<00:00, 26.35s/it]
[2026-04-29 02:16:15,703] [INFO] [axolotl.train.save_trained_model:241] [PID:3345] Training completed! Saving trained model to ./Writer-Stage-2.
[2026-04-29 02:16:16,840] [INFO] [axolotl.train.save_trained_model:355] [PID:3345] Model successfully saved to ./Writer-Stage-2